From b021bb2cdc99d360bb59b608425892218596d198 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jan 2023 16:49:02 -0800 Subject: [PATCH 1/6] BUG: GroupBy.min/max with unordered Categorical and no groups --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/groupby/ops.py | 15 +++--- pandas/tests/groupby/test_groupby.py | 81 +++++++++++++--------------- 3 files changed, 45 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e90717efc10b5..08890018804e2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -958,6 +958,7 @@ Categorical - Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) +- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`??`) - Datetimelike diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ced3190206f37..6aa52679c6526 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -227,9 +227,10 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): Raises ------ + TypeError + This is not a valid operation for this dtype. NotImplementedError - This is either not a valid function for this dtype, or - valid but not implemented in cython. + This may be a valid operation, but does not have a cython implementation. """ how = self.how @@ -238,16 +239,16 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): return if isinstance(dtype, CategoricalDtype): - # NotImplementedError for methods that can fall back to a - # non-cython implementation. if how in ["sum", "prod", "cumsum", "cumprod"]: raise TypeError(f"{dtype} type does not support {how} operations") + if how in ["min", "max", "rank"] and not dtype.ordered: + # raise TypeError instead of NotImplementedError to ensure we + # don't go down a group-by-group path, since in the empty-groups + # case that would fail to raise + raise TypeError(f"Cannot perform {how} with non-ordered Categorical") if how not in ["rank"]: # only "rank" is implemented in cython raise NotImplementedError(f"{dtype} dtype not supported") - if not dtype.ordered: - # TODO: TypeError? - raise NotImplementedError(f"{dtype} dtype not supported") elif is_sparse(dtype): raise NotImplementedError(f"{dtype} dtype not supported") diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e9b18836e003c..5a41b3bdbdc7b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1901,25 +1901,12 @@ def test_empty_groupby( raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 1 - and not isinstance(columns, list) - ): - mark = pytest.mark.xfail( - raises=TypeError, match="'Categorical' does not implement" - ) - request.node.add_marker(mark) elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 2 - and op in ["min", "max", "sum"] - ): + elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) @@ -1949,6 +1936,31 @@ def get_result(**kwargs): else: return getattr(gb, method)(op, **kwargs) + if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + with pytest.raises(TypeError, match=msg): + get_result() + + if isinstance(columns, list): + # i.e. DataframeGroupBy, not SeriesGroupBy + result = get_result(numeric_only=True) + + # Categorical is special without 'observed=True', we get an NaN entry + # corresponding to the unobserved group. If we passed observed=True + # to groupby, expected would just be 'df.set_index(keys)[columns]' + # as below + lev = Categorical([0], dtype=values.dtype) + if len(keys) != 1: + idx = MultiIndex.from_product([lev, lev], names=keys) + else: + # all columns are dropped, but we end up with one row + # Categorical is special without 'observed=True' + idx = Index(lev, name=keys[0]) + + expected = DataFrame([], columns=[], index=idx) + tm.assert_equal(result, expected) + return + if columns == "C": # i.e. SeriesGroupBy if op in ["prod", "sum", "skew"]: @@ -2018,38 +2030,17 @@ def get_result(**kwargs): tm.assert_equal(result, expected) return - if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or ( - op == "skew" and df.dtypes[0].kind == "M" + if op == "skew" and ( + isinstance(values, Categorical) or df.dtypes[0].kind == "M" ): - if op == "skew" or len(keys) == 1: - msg = "|".join( - [ - "Categorical is not ordered", - "does not support reduction", - ] - ) - with pytest.raises(TypeError, match=msg): - get_result() - return - # Categorical doesn't implement, so with numeric_only=True - # these are dropped and we get an empty DataFrame back - result = get_result() - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - if len(keys) != 1: - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=keys) - expected = DataFrame([], columns=[], index=mi) - else: - # all columns are dropped, but we end up with one row - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - ci = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=ci) - - tm.assert_equal(result, expected) + msg = "|".join( + [ + "Categorical is not ordered", + "does not support reduction", + ] + ) + with pytest.raises(TypeError, match=msg): + get_result() return result = get_result() From 9225fc75525fe3d8b26673aec4edccdfceaa940a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jan 2023 16:50:38 -0800 Subject: [PATCH 2/6] GH ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 08890018804e2..3e465e1d7c07b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -958,7 +958,7 @@ Categorical - Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`) - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`) -- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`??`) +- Bug in :meth:`SeriesGroupBy.min`, :meth:`SeriesGroupBy.max`, :meth:`DataFrameGroupBy.min`, and :meth:`DataFrameGroupBy.max` with unordered :class:`CategoricalDtype` with no groups failing to raise ``TypeError`` (:issue:`51034`) - Datetimelike From 203431e6c0ec7bb81fcd53e7724d6db57f55563b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 27 Jan 2023 19:09:04 -0800 Subject: [PATCH 3/6] update test --- pandas/tests/groupby/test_function.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e16e353cc1a4..1ef97fed7ba05 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -250,6 +250,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "Categorical is not ordered", "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", ] ) with pytest.raises(exception, match=msg): @@ -276,6 +277,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support", "can't multiply sequence", "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", ] ) with pytest.raises(exception, match=msg): From 29377a5afd22cdda7652543071ecc6b5c0670f29 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jan 2023 12:33:05 -0800 Subject: [PATCH 4/6] test for non-ordered Categorical rank --- pandas/tests/groupby/test_rank.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 8bbe38d3379ac..d0b848a567346 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -13,6 +13,23 @@ import pandas._testing as tm +def test_rank_unordered_categorical_typeerror(): + # GH#51034 should be TypeError, not NotImplementedError + cat = pd.Categorical([], ordered=False) + ser = Series(cat) + df = ser.to_frame() + + msg = "Cannot perform rank with non-ordered Categorical" + + gb = ser.groupby(cat) + with pytest.raises(TypeError, match=msg): + gb.rank() + + gb2 = df.groupby(cat) + with pytest.raises(TypeError, match=msg): + gb2.rank() + + def test_rank_apply(): lev1 = tm.rands_array(10, 100) lev2 = tm.rands_array(10, 130) From 5598a7f1744f3819e097240f6b5c5a28f3b05cd0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jan 2023 13:11:08 -0800 Subject: [PATCH 5/6] test period case, fix most xfails --- pandas/tests/groupby/test_groupby.py | 112 ++++++++++----------------- 1 file changed, 39 insertions(+), 73 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5a41b3bdbdc7b..8c00bff291fd2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1858,6 +1858,7 @@ def test_pivot_table_values_key_error(): Categorical([0]), [to_datetime(0)], date_range(0, 1, 1, tz="US/Eastern"), + pd.period_range("2016-01-01", periods=3, freq="D"), pd.array([0], dtype="Int64"), pd.array([0], dtype="Float64"), pd.array([False], dtype="boolean"), @@ -1870,6 +1871,7 @@ def test_pivot_table_values_key_error(): "cat", "dt64", "dt64tz", + "period", "Int64", "Float64", "boolean", @@ -1886,13 +1888,6 @@ def test_empty_groupby( override_dtype = None if ( - isinstance(values, Categorical) - and not isinstance(columns, list) - and op in ["sum", "prod", "skew"] - ): - # handled below GH#41291 - pass - elif ( isinstance(values, Categorical) and len(keys) == 1 and op in ["idxmax", "idxmin"] @@ -1901,18 +1896,8 @@ def test_empty_groupby( raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" ) request.node.add_marker(mark) - elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: - mark = pytest.mark.xfail( - raises=AssertionError, match="(DataFrame|Series) are different" - ) - request.node.add_marker(mark) - elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]: - mark = pytest.mark.xfail( - raises=AssertionError, match="(DataFrame|Series) are different" - ) - request.node.add_marker(mark) - elif isinstance(values, BooleanArray) and op in ["sum", "prod"]: + if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -1936,6 +1921,26 @@ def get_result(**kwargs): else: return getattr(gb, method)(op, **kwargs) + def get_categorical_invalid_expected(): + # Categorical is special without 'observed=True', we get an NaN entry + # corresponding to the unobserved group. If we passed observed=True + # to groupby, expected would just be 'df.set_index(keys)[columns]' + # as below + lev = Categorical([0], dtype=values.dtype) + if len(keys) != 1: + idx = MultiIndex.from_product([lev, lev], names=keys) + else: + # all columns are dropped, but we end up with one row + # Categorical is special without 'observed=True' + idx = Index(lev, name=keys[0]) + + expected = DataFrame([], columns=[], index=idx) + return expected + + is_per = isinstance(df.dtypes[0], pd.PeriodDtype) + is_dt64 = df.dtypes[0].kind == "M" + is_cat = isinstance(values, Categorical) + if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: msg = f"Cannot perform {op} with non-ordered Categorical" with pytest.raises(TypeError, match=msg): @@ -1944,20 +1949,7 @@ def get_result(**kwargs): if isinstance(columns, list): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) - - # Categorical is special without 'observed=True', we get an NaN entry - # corresponding to the unobserved group. If we passed observed=True - # to groupby, expected would just be 'df.set_index(keys)[columns]' - # as below - lev = Categorical([0], dtype=values.dtype) - if len(keys) != 1: - idx = MultiIndex.from_product([lev, lev], names=keys) - else: - # all columns are dropped, but we end up with one row - # Categorical is special without 'observed=True' - idx = Index(lev, name=keys[0]) - - expected = DataFrame([], columns=[], index=idx) + expected = get_categorical_invalid_expected() tm.assert_equal(result, expected) return @@ -1965,22 +1957,15 @@ def get_result(**kwargs): # i.e. SeriesGroupBy if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if df.dtypes[0].kind == "M": + if is_dt64 or is_cat or is_per: # GH#41291 # datetime64 -> prod and sum are invalid if op == "skew": msg = "does not support reduction 'skew'" - else: + elif is_dt64: msg = "datetime64 type does not support" - with pytest.raises(TypeError, match=msg): - get_result() - - return - if op in ["prod", "sum", "skew"]: - if isinstance(values, Categorical): - # GH#41291 - if op == "skew": - msg = f"does not support reduction '{op}'" + elif is_per: + msg = "Period type does not support" else: msg = "category type does not support" with pytest.raises(TypeError, match=msg): @@ -1991,48 +1976,29 @@ def get_result(**kwargs): # ie. DataFrameGroupBy if op in ["prod", "sum"]: # ops that require more than just ordered-ness - if df.dtypes[0].kind == "M": + if is_dt64 or is_per or is_cat: # GH#41291 # datetime64 -> prod and sum are invalid - with pytest.raises(TypeError, match="datetime64 type does not support"): - get_result() - result = get_result(numeric_only=True) - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - expected = df.set_index(keys)[[]] - tm.assert_equal(result, expected) - return + if is_dt64: + msg = "datetime64 type does not support" + elif is_per: + msg = "Period type does not support" + else: + msg = "category type does not support" - elif isinstance(values, Categorical): - # GH#41291 - # Categorical doesn't implement sum or prod - with pytest.raises(TypeError, match="category type does not support"): + with pytest.raises(TypeError, match=msg): get_result() result = get_result(numeric_only=True) # with numeric_only=True, these are dropped, and we get # an empty DataFrame back expected = df.set_index(keys)[[]] - if len(keys) != 1 and op == "prod": - # TODO: why just prod and not sum? - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=["A", "B"]) - expected = DataFrame([], columns=[], index=mi) - - tm.assert_equal(result, expected) - return - - elif df.dtypes[0] == object: - result = get_result() - expected = df.set_index(keys)[["C"]] + if is_cat: + expected = get_categorical_invalid_expected() tm.assert_equal(result, expected) return - if op == "skew" and ( - isinstance(values, Categorical) or df.dtypes[0].kind == "M" - ): + if op == "skew" and (is_cat or is_dt64 or is_per): msg = "|".join( [ "Categorical is not ordered", From 346dd91c433516f46925cf8390d724db2f571cd3 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 30 Jan 2023 13:15:10 -0800 Subject: [PATCH 6/6] simplify --- pandas/tests/groupby/test_groupby.py | 67 ++++++++++------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8c00bff291fd2..2c6d386a36e78 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1953,41 +1953,33 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if columns == "C": - # i.e. SeriesGroupBy - if op in ["prod", "sum", "skew"]: - # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: - # GH#41291 - # datetime64 -> prod and sum are invalid - if op == "skew": - msg = "does not support reduction 'skew'" - elif is_dt64: - msg = "datetime64 type does not support" - elif is_per: - msg = "Period type does not support" - else: - msg = "category type does not support" - with pytest.raises(TypeError, match=msg): - get_result() + if op in ["prod", "sum", "skew"]: + # ops that require more than just ordered-ness + if is_dt64 or is_cat or is_per: + # GH#41291 + # datetime64 -> prod and sum are invalid + if op == "skew": + msg = "does not support reduction 'skew'" + elif is_dt64: + msg = "datetime64 type does not support" + elif is_per: + msg = "Period type does not support" + else: + msg = "category type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + if not isinstance(columns, list): + # i.e. SeriesGroupBy return - else: - # ie. DataFrameGroupBy - if op in ["prod", "sum"]: - # ops that require more than just ordered-ness - if is_dt64 or is_per or is_cat: + elif op == "skew": + # TODO: test the numeric_only=True case + return + else: + # i.e. op in ["prod", "sum"]: + # i.e. DataFrameGroupBy + # ops that require more than just ordered-ness # GH#41291 - # datetime64 -> prod and sum are invalid - if is_dt64: - msg = "datetime64 type does not support" - elif is_per: - msg = "Period type does not support" - else: - msg = "category type does not support" - - with pytest.raises(TypeError, match=msg): - get_result() result = get_result(numeric_only=True) # with numeric_only=True, these are dropped, and we get @@ -1998,17 +1990,6 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if op == "skew" and (is_cat or is_dt64 or is_per): - msg = "|".join( - [ - "Categorical is not ordered", - "does not support reduction", - ] - ) - with pytest.raises(TypeError, match=msg): - get_result() - return - result = get_result() expected = df.set_index(keys)[columns] if override_dtype is not None: