From 85056a48c0cfb345be46a3bfe3606ba8c6462906 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Wed, 3 May 2023 06:57:13 +0000 Subject: [PATCH 1/7] fixed groupby quantile index.levels --- pandas/core/groupby/groupby.py | 5 ++++- pandas/tests/groupby/test_quantile.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e26b4779ef7fa..bd5a77901c6b3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4317,5 +4317,8 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) else: - mi = MultiIndex.from_product([idx, qs]) + lev_codes, lev = Index(qs).factorize() + levels = [idx, lev] + codes = [np.repeat(range(len(idx)), nqs), np.tile(lev_codes, len(idx))] + mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 5801223c0e718..a116f64f39812 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -471,3 +471,27 @@ def test_groupby_quantile_dt64tz_period(): expected.index = expected.index.astype(np.int_) tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_nonmulti_levels_order(): + # Non-regression test for GH #53009 + ind = pd.MultiIndex.from_tuples( + [ + (0, "a", "B"), + (0, "a", "A"), + (0, "b", "B"), + (0, "b", "A"), + (1, "a", "B"), + (1, "a", "A"), + (1, "b", "B"), + (1, "b", "A"), + ], + names=["sample", "cat0", "cat1"], + ) + ds = pd.Series(range(8), index=ind) + + s1 = ds.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) + + # Check that levels are not sorted + expected = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) + tm.assert_equal(s1.index.levels, expected) From 212b86bde85b9b7da13cc5d49269ea62fe40bcb2 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Wed, 3 May 2023 07:06:07 +0000 Subject: [PATCH 2/7] changelog added --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 84e011bf80da6..41a52cd2715c1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -401,6 +401,7 @@ Groupby/resample/rolling the function operated on the whole index rather than each element of the index. (:issue:`51979`) - Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) +- Bug in :meth:`GroupBy.quantile` may implicitly sort ``index.levels`` (:issue:`53009`) - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64 or :class:`PeriodDtype` values (:issue:`52128`) - From 4456efda7b9e1e7fd08c852055ace219fcf969cf Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Thu, 4 May 2023 03:30:24 +0000 Subject: [PATCH 3/7] updated changelog --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 41a52cd2715c1..0977c370500e0 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -401,7 +401,7 @@ Groupby/resample/rolling the function operated on the whole index rather than each element of the index. (:issue:`51979`) - Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) -- Bug in :meth:`GroupBy.quantile` may implicitly sort ``index.levels`` (:issue:`53009`) +- Bug in :meth:`GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64 or :class:`PeriodDtype` values (:issue:`52128`) - From 499d39330b6e7307e9d40ee1edf9d35b5d1abb5d Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Thu, 4 May 2023 04:44:29 +0000 Subject: [PATCH 4/7] modified test to test whole frame - not sure but I think it is not checking index levels like this --- pandas/tests/groupby/test_quantile.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index a116f64f39812..f709550b0a98d 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -488,10 +488,15 @@ def test_groupby_quantile_nonmulti_levels_order(): ], names=["sample", "cat0", "cat1"], ) - ds = pd.Series(range(8), index=ind) - - s1 = ds.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) + ser = pd.Series(range(8), index=ind) + result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) + + # We need to check that index levels are not sorted + qind = pd.MultiIndex( + [["B", "A"], [0.2, 0.8]], + [[0, 0, 1, 1], [0, 1, 0, 1]], + names=["cat1", None], + ) + expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind) - # Check that levels are not sorted - expected = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) - tm.assert_equal(s1.index.levels, expected) + tm.assert_series_equal(result, expected) From 518d8ae18c4e4d6e2498b8fa81cef8b4fd431a18 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Fri, 5 May 2023 04:36:33 +0000 Subject: [PATCH 5/7] updated tests to check index levels separately --- pandas/tests/groupby/test_quantile.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index f709550b0a98d..7bf168944a1ac 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -491,12 +491,13 @@ def test_groupby_quantile_nonmulti_levels_order(): ser = pd.Series(range(8), index=ind) result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) - # We need to check that index levels are not sorted - qind = pd.MultiIndex( - [["B", "A"], [0.2, 0.8]], - [[0, 0, 1, 1], [0, 1, 0, 1]], - names=["cat1", None], + qind = pd.MultiIndex.from_tuples( + [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None] ) expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind) tm.assert_series_equal(result, expected) + + # We need to check that index levels are not sorted + expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) + tm.assert_equal(result.index.levels, expected_levels) From edaa681d6eb685b3eef3e9ddd0141feae934c443 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Sun, 7 May 2023 14:09:44 +0000 Subject: [PATCH 6/7] avoid performance regression when not multiindex --- pandas/core/groupby/groupby.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 395fa33b8c327..15e1fc1e85cc4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -70,7 +70,10 @@ class providing the base-class of operations. ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import ensure_dtype_can_hold_na +from pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + ensure_dtype_can_hold_na, +) from pandas.core.dtypes.common import ( is_bool_dtype, is_float_dtype, @@ -4317,8 +4320,11 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) else: + nidx = len(idx) lev_codes, lev = Index(qs).factorize() + lev_codes = coerce_indexer_dtype(lev_codes, lev) + idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) levels = [idx, lev] - codes = [np.repeat(range(len(idx)), nqs), np.tile(lev_codes, len(idx))] + codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)] mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi From ef2be54ab0d3cee1deb7e2fae386c760200331e9 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Tue, 9 May 2023 23:13:13 +0000 Subject: [PATCH 7/7] tried to combine code blocks --- pandas/core/groupby/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 75019a393514b..5928c32e22b7f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4312,19 +4312,19 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde MultiIndex """ nqs = len(qs) + lev_codes, lev = Index(qs).factorize() + lev_codes = coerce_indexer_dtype(lev_codes, lev) if idx._is_multi: idx = cast(MultiIndex, idx) - lev_codes, lev = Index(qs).factorize() levels = list(idx.levels) + [lev] codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) else: nidx = len(idx) - lev_codes, lev = Index(qs).factorize() - lev_codes = coerce_indexer_dtype(lev_codes, lev) idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) levels = [idx, lev] codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)] mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) + return mi