diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6bbaa934eaa12..92e7c574c87be 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -851,6 +851,7 @@ Groupby/resample/rolling - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`) - Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) - Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) +- Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`) - Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:41111`) Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 771ba2845db6e..6828adae02e06 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -358,9 +358,26 @@ def _cython_agg_general( if numeric_only and not is_numeric: continue - result = self.grouper._cython_operation( - "aggregate", obj._values, how, axis=0, min_count=min_count - ) + objvals = obj._values + + if isinstance(objvals, Categorical): + if self.grouper.ngroups > 0: + # without special-casing, we would raise, then in fallback + # would eventually call agg_series but without re-casting + # to Categorical + # equiv: res_values, _ = self.grouper.agg_series(obj, alt) + res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt) + else: + # equiv: res_values = self._python_agg_general(alt) + res_values = self._python_apply_general(alt, self._selected_obj) + + result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype) + + else: + result = self.grouper._cython_operation( + "aggregate", obj._values, how, axis=0, min_count=min_count + ) + assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) output[key] = result @@ -1092,13 +1109,7 @@ def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike: # see if we can cast the values to the desired dtype # this may not be the original dtype - if isinstance(values, Categorical) and isinstance(result, np.ndarray): - # If the Categorical op didn't raise, it is dtype-preserving - # We get here with how="first", "last", "min", "max" - result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) - # Note this will have result.dtype == dtype from above - - elif ( + if ( not using_array_manager and isinstance(result.dtype, np.dtype) and result.ndim == 1 @@ -1140,9 +1151,14 @@ def py_fallback(values: ArrayLike) -> ArrayLike: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) + # Note: bc obj is always a Series here, we can ignore axis and pass # `alt` directly instead of `lambda x: alt(x, axis=self.axis)` - res_ser = sgb.aggregate(alt) # this will go through sgb._python_agg_general + # use _agg_general bc it will go through _cython_agg_general + # which will correctly cast Categoricals. + res_ser = sgb._agg_general( + numeric_only=False, min_count=min_count, alias=how, npfunc=alt + ) # unwrap Series to get array res_values = res_ser._mgr.arrays[0] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 28344897a686f..c61360e67e505 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1105,6 +1105,11 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): expected_df = DataFrame(data=exp_data, index=cat_index) + if "cat_ord" in expected_df: + # ordered categorical columns should be preserved + dtype = input_df["cat_ord"].dtype + expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype) + tm.assert_frame_equal(result_df, expected_df) @@ -1149,6 +1154,10 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): multi_index = MultiIndex.from_tuples(tuple(multi_index_list)) expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index) + for col in expected_df.columns: + if isinstance(col, tuple) and "cat_ord" in col: + # ordered categorical should be preserved + expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype) tm.assert_frame_equal(result_df, expected_df) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index da438826a939a..0509b3e165bcb 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -800,6 +800,12 @@ def test_preserve_on_ordered_ops(func, values): ).set_index("payload") tm.assert_frame_equal(result, expected) + # we should also preserve categorical for SeriesGroupBy + sgb = df.groupby("payload")["col"] + result = getattr(sgb, func)() + expected = expected["col"] + tm.assert_series_equal(result, expected) + def test_categorical_no_compress(): data = Series(np.random.randn(9)) @@ -1494,7 +1500,11 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func): df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()}) df_grouped = df.groupby("A")["B"] result = getattr(df_grouped, func)() - expected = Series(["b"], index=Index([1997], name="A"), name="B") + + # ordered categorical dtype should be preserved + expected = Series( + ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype + ) tm.assert_series_equal(result, expected) @@ -1561,7 +1571,15 @@ def test_agg_cython_category_not_implemented_fallback(): df["col_cat"] = df["col_num"].astype("category") result = df.groupby("col_num").col_cat.first() - expected = Series([1, 2, 3], index=Index([1, 2, 3], name="col_num"), name="col_cat") + + # ordered categorical dtype should definitely be preserved; + # this is unordered, so is less-clear case (if anything, it should raise) + expected = Series( + [1, 2, 3], + index=Index([1, 2, 3], name="col_num"), + name="col_cat", + dtype=df["col_cat"].dtype, + ) tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"}) @@ -1576,6 +1594,10 @@ def test_aggregate_categorical_lost_index(func: str): df = DataFrame({"A": [1997], "B": ds}) result = df.groupby("A").agg({"B": func}) expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + tm.assert_frame_equal(result, expected) @@ -1653,6 +1675,9 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) + # .transform(max) should preserve ordered categoricals + expected["last_status"] = expected["last_status"].astype(delivery_status_type) + tm.assert_frame_equal(result, expected)