From 6db62fcb96c3a76caa49ce2eeaddeeb5036f2e72 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Apr 2021 16:22:51 -0700 Subject: [PATCH 1/2] BUG: retain ordered Categorical dtype in SeriesGroupBy aggregations --- pandas/core/groupby/generic.py | 41 ++++++++++++++----- .../tests/groupby/aggregate/test_aggregate.py | 9 ++++ pandas/tests/groupby/test_categorical.py | 29 ++++++++++++- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9d6d2d698dfe5..ebc0b8f9d23ef 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -360,9 +360,26 @@ def _cython_agg_general( if numeric_only and not is_numeric: continue - result = self.grouper._cython_operation( - "aggregate", obj._values, how, axis=0, min_count=min_count - ) + objvals = obj._values + + if isinstance(objvals, Categorical): + if self.grouper.ngroups > 0: + # without special-casing, we would raise, then in fallback + # would eventually call agg_series but without re-casting + # to Categorical + # equiv: res_values, _ = self.grouper.agg_series(obj, alt) + res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt) + else: + # equiv: res_values = self._python_agg_general(alt) + res_values = self._python_apply_general(alt, self._selected_obj) + + result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype) + + else: + result = self.grouper._cython_operation( + "aggregate", obj._values, how, axis=0, min_count=min_count + ) + assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) output[key] = result @@ -1138,13 +1155,7 @@ def cast_agg_result( # see if we can cast the values to the desired dtype # this may not be the original dtype - if isinstance(values, Categorical) and isinstance(result, np.ndarray): - # If the Categorical op didn't raise, it is dtype-preserving - # We get here with how="first", "last", "min", "max" - result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) - # Note this will have result.dtype == dtype from above - - elif ( + if ( not using_array_manager and isinstance(result, np.ndarray) and result.ndim == 1 @@ -1184,7 +1195,15 @@ def py_fallback(values: ArrayLike) -> ArrayLike: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + if sgb.ndim == 2: + # equiv: result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + result = sgb._python_agg_general(lambda x: alt(x, axis=self.axis)) + else: + # use _agg_general bc it will go through _cython_agg_general + # which will correctly cast Categoricals. + result = sgb._agg_general( + numeric_only=False, min_count=min_count, alias=how, npfunc=alt + ) assert isinstance(result, (Series, DataFrame)) # for mypy # In the case of object dtype block, it may have been split diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 28344897a686f..c61360e67e505 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1105,6 +1105,11 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): expected_df = DataFrame(data=exp_data, index=cat_index) + if "cat_ord" in expected_df: + # ordered categorical columns should be preserved + dtype = input_df["cat_ord"].dtype + expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype) + tm.assert_frame_equal(result_df, expected_df) @@ -1149,6 +1154,10 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): multi_index = MultiIndex.from_tuples(tuple(multi_index_list)) expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index) + for col in expected_df.columns: + if isinstance(col, tuple) and "cat_ord" in col: + # ordered categorical should be preserved + expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype) tm.assert_frame_equal(result_df, expected_df) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index da438826a939a..0509b3e165bcb 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -800,6 +800,12 @@ def test_preserve_on_ordered_ops(func, values): ).set_index("payload") tm.assert_frame_equal(result, expected) + # we should also preserve categorical for SeriesGroupBy + sgb = df.groupby("payload")["col"] + result = getattr(sgb, func)() + expected = expected["col"] + tm.assert_series_equal(result, expected) + def test_categorical_no_compress(): data = Series(np.random.randn(9)) @@ -1494,7 +1500,11 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func): df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()}) df_grouped = df.groupby("A")["B"] result = getattr(df_grouped, func)() - expected = Series(["b"], index=Index([1997], name="A"), name="B") + + # ordered categorical dtype should be preserved + expected = Series( + ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype + ) tm.assert_series_equal(result, expected) @@ -1561,7 +1571,15 @@ def test_agg_cython_category_not_implemented_fallback(): df["col_cat"] = df["col_num"].astype("category") result = df.groupby("col_num").col_cat.first() - expected = Series([1, 2, 3], index=Index([1, 2, 3], name="col_num"), name="col_cat") + + # ordered categorical dtype should definitely be preserved; + # this is unordered, so is less-clear case (if anything, it should raise) + expected = Series( + [1, 2, 3], + index=Index([1, 2, 3], name="col_num"), + name="col_cat", + dtype=df["col_cat"].dtype, + ) tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"}) @@ -1576,6 +1594,10 @@ def test_aggregate_categorical_lost_index(func: str): df = DataFrame({"A": [1997], "B": ds}) result = df.groupby("A").agg({"B": func}) expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + tm.assert_frame_equal(result, expected) @@ -1653,6 +1675,9 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) + # .transform(max) should preserve ordered categoricals + expected["last_status"] = expected["last_status"].astype(delivery_status_type) + tm.assert_frame_equal(result, expected) From 0addcb304c2afc266367b94d14a8120119b8523c Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Apr 2021 14:05:35 -0700 Subject: [PATCH 2/2] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7189c6e68d53d..dff4d32969c18 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -850,6 +850,7 @@ Groupby/resample/rolling - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`) - Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) - Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) +- Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`) Reshaping ^^^^^^^^^