From 120392d9988952474c465725c8da4b448422f5e6 Mon Sep 17 00:00:00 2001 From: gt-on-1234 Date: Tue, 25 Oct 2022 22:40:44 +0100 Subject: [PATCH 1/3] Closes #49223 --- pandas/core/groupby/categorical.py | 2 +- pandas/core/groupby/grouper.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index b11bbf35312c9..777aad8391b60 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -53,7 +53,7 @@ def recode_for_groupby( unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] - if c.ordered: + if sort: take_codes = np.sort(take_codes) # we recode according to the uniques diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7ae6495f15541..9fef30d12043e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -502,6 +502,15 @@ def __init__( self._group_index, ) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna) + if is_categorical_dtype(self.grouping_vector): + self._passed_categorical = True + self._orig_cats = self.grouping_vector.categories + + # Should the sort arg be just `sort` or `sort or self.grouping_vector.ordered`? + self.grouping_vector, self._all_grouper = recode_for_groupby( + self.grouping_vector, sort, observed + ) + # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouping_vector, Grouper): @@ -529,6 +538,8 @@ def __init__( self._passed_categorical = True self._orig_cats = self.grouping_vector.categories + + # Should the sort arg be just `sort` or `sort or self.grouping_vector.ordered`? 
self.grouping_vector, self._all_grouper = recode_for_groupby( self.grouping_vector, sort, observed ) From 3f9565586cd708dc6015669a154ad43b2d8f46a3 Mon Sep 17 00:00:00 2001 From: gt-on-1234 Date: Tue, 25 Oct 2022 23:27:25 +0100 Subject: [PATCH 2/3] Adds doc in v2.0.0.rst and adds test_categorical_vs_range_index_sorting --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/tests/groupby/test_categorical.py | 31 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 0f8afe14a2369..a711e31c204ca 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -277,6 +277,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) +- Bug in :class:`GroupBy` when a categorical column was used as a grouper: with a range index, the ordering of the result would depend on the ``sort`` argument, but if the index was categorical it would depend on the ``ordered`` attribute of the index (:issue:`49223`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 092fd4a4d6be0..aae5eed8e44c3 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -2046,3 +2046,34 @@ def test_many_categories(as_index, sort, index_kind, ordered): expected = DataFrame({"a": Series(index), "b": data}) tm.assert_frame_equal(result, expected) + + +def test_categorical_vs_range_index_sorting(): + categories = np.arange(4, -1, -1) + for range_index in [True, False]: + df_ordered = DataFrame( + { + "a": Categorical([2, 1, 2, 3], categories=categories, ordered=True), + "b": range(4) + } + ) + df_unordered = DataFrame( + { + "a": Categorical([2, 1, 2, 3], categories=categories, ordered=False), + "b": range(4) + } + ) + + if
not range_index: + df_ordered = df_ordered.set_index('a') + df_unordered = df_unordered.set_index('a') + + gb_ordered_sort = df_ordered.groupby("a", sort=True, observed=True) + gb_ordered_nosort = df_ordered.groupby("a", sort=False, observed=True) + gb_unordered_sort = df_unordered.groupby("a", sort=True, observed=True) + gb_unordered_nosort = df_unordered.groupby("a", sort=False, observed=True) + + assert gb_ordered_sort.sum()["b"].tolist() == [3, 2, 1] + assert gb_ordered_nosort.sum()["b"].tolist() == [2, 1, 3] + assert gb_unordered_sort.sum()["b"].tolist() == [3, 2, 1] + assert gb_unordered_nosort.sum()["b"].tolist() == [2, 1, 3] \ No newline at end of file From 1ef00bbb21a82cf9088a711804f411161c1bbd00 Mon Sep 17 00:00:00 2001 From: gt-on-1234 Date: Wed, 26 Oct 2022 19:05:22 +0100 Subject: [PATCH 3/3] typing and formatting --- pandas/core/groupby/grouper.py | 2 ++ pandas/tests/groupby/test_categorical.py | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9fef30d12043e..6e831d7a45fd0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -503,6 +503,8 @@ def __init__( ) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna) if is_categorical_dtype(self.grouping_vector): + self.grouping_vector: Categorical + self._passed_categorical = True self._orig_cats = self.grouping_vector.categories diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index aae5eed8e44c3..3402cd6fc5357 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -2054,19 +2054,19 @@ def test_categorical_vs_range_index_sorting(): df_ordered = DataFrame( { "a": Categorical([2, 1, 2, 3], categories=categories, ordered=True), - "b": range(4) + "b": range(4), } ) df_unordered = DataFrame( { "a": Categorical([2, 1, 2, 3], categories=categories, ordered=False), - "b": range(4) 
+ "b": range(4), } ) if not range_index: - df_ordered = df_ordered.set_index('a') - df_unordered = df_unordered.set_index('a') + df_ordered = df_ordered.set_index("a") + df_unordered = df_unordered.set_index("a") gb_ordered_sort = df_ordered.groupby("a", sort=True, observed=True) gb_ordered_nosort = df_ordered.groupby("a", sort=False, observed=True) @@ -2076,4 +2076,4 @@ def test_categorical_vs_range_index_sorting(): assert gb_ordered_sort.sum()["b"].tolist() == [3, 2, 1] assert gb_ordered_nosort.sum()["b"].tolist() == [2, 1, 3] assert gb_unordered_sort.sum()["b"].tolist() == [3, 2, 1] - assert gb_unordered_nosort.sum()["b"].tolist() == [2, 1, 3] \ No newline at end of file + assert gb_unordered_nosort.sum()["b"].tolist() == [2, 1, 3]