Skip to content

BUG: retain ordered Categorical dtype in SeriesGroupBy aggregations #41147

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,7 @@ Groupby/resample/rolling
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
- Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`)
- Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`)
- Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`)
- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:41111`)

Reshaping
Expand Down
38 changes: 27 additions & 11 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,9 +358,26 @@ def _cython_agg_general(
if numeric_only and not is_numeric:
continue

result = self.grouper._cython_operation(
"aggregate", obj._values, how, axis=0, min_count=min_count
)
objvals = obj._values

if isinstance(objvals, Categorical):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to try to separate this out further upstream in the future, so that _cython_agg_general isn't calling pure-python methods?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we'll get this case handled within WrappedCythonOp._ea_wrap_cython_operation before too long, but im still troubleshooting that

if self.grouper.ngroups > 0:
# without special-casing, we would raise, then in fallback
# would eventually call agg_series but without re-casting
# to Categorical
# equiv: res_values, _ = self.grouper.agg_series(obj, alt)
res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt)
else:
# equiv: res_values = self._python_agg_general(alt)
res_values = self._python_apply_general(alt, self._selected_obj)

result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype)

else:
result = self.grouper._cython_operation(
"aggregate", obj._values, how, axis=0, min_count=min_count
)

assert result.ndim == 1
key = base.OutputKey(label=name, position=idx)
output[key] = result
Expand Down Expand Up @@ -1092,13 +1109,7 @@ def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike:
# see if we can cast the values to the desired dtype
# this may not be the original dtype

if isinstance(values, Categorical) and isinstance(result, np.ndarray):
# If the Categorical op didn't raise, it is dtype-preserving
# We get here with how="first", "last", "min", "max"
result = type(values)._from_sequence(result.ravel(), dtype=values.dtype)
# Note this will have result.dtype == dtype from above

elif (
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in conjunction with #41086, cast_agg_result becomes a 1-liner and can be cleanly tacked onto the end of py_fallback

if (
not using_array_manager
and isinstance(result.dtype, np.dtype)
and result.ndim == 1
Expand Down Expand Up @@ -1140,9 +1151,14 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
# Categoricals. This will done by later self._reindex_output()
# Doing it here creates an error. See GH#34951
sgb = get_groupby(obj, self.grouper, observed=True)

# Note: bc obj is always a Series here, we can ignore axis and pass
# `alt` directly instead of `lambda x: alt(x, axis=self.axis)`
res_ser = sgb.aggregate(alt) # this will go through sgb._python_agg_general
# use _agg_general bc it will go through _cython_agg_general
# which will correctly cast Categoricals.
res_ser = sgb._agg_general(
numeric_only=False, min_count=min_count, alias=how, npfunc=alt
)

# unwrap Series to get array
res_values = res_ser._mgr.arrays[0]
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,11 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):

expected_df = DataFrame(data=exp_data, index=cat_index)

if "cat_ord" in expected_df:
# ordered categorical columns should be preserved
dtype = input_df["cat_ord"].dtype
expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype)

tm.assert_frame_equal(result_df, expected_df)


Expand Down Expand Up @@ -1149,6 +1154,10 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
multi_index = MultiIndex.from_tuples(tuple(multi_index_list))

expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index)
for col in expected_df.columns:
if isinstance(col, tuple) and "cat_ord" in col:
# ordered categorical should be preserved
expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype)

tm.assert_frame_equal(result_df, expected_df)

Expand Down
29 changes: 27 additions & 2 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,12 @@ def test_preserve_on_ordered_ops(func, values):
).set_index("payload")
tm.assert_frame_equal(result, expected)

# we should also preserve categorical for SeriesGroupBy
sgb = df.groupby("payload")["col"]
result = getattr(sgb, func)()
expected = expected["col"]
tm.assert_series_equal(result, expected)


def test_categorical_no_compress():
data = Series(np.random.randn(9))
Expand Down Expand Up @@ -1494,7 +1500,11 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func):
df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()})
df_grouped = df.groupby("A")["B"]
result = getattr(df_grouped, func)()
expected = Series(["b"], index=Index([1997], name="A"), name="B")

# ordered categorical dtype should be preserved
expected = Series(
["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype
)
tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -1561,7 +1571,15 @@ def test_agg_cython_category_not_implemented_fallback():
df["col_cat"] = df["col_num"].astype("category")

result = df.groupby("col_num").col_cat.first()
expected = Series([1, 2, 3], index=Index([1, 2, 3], name="col_num"), name="col_cat")

# ordered categorical dtype should definitely be preserved;
# this is unordered, so is less-clear case (if anything, it should raise)
expected = Series(
[1, 2, 3],
index=Index([1, 2, 3], name="col_num"),
name="col_cat",
dtype=df["col_cat"].dtype,
)
tm.assert_series_equal(result, expected)

result = df.groupby("col_num").agg({"col_cat": "first"})
Expand All @@ -1576,6 +1594,10 @@ def test_aggregate_categorical_lost_index(func: str):
df = DataFrame({"A": [1997], "B": ds})
result = df.groupby("A").agg({"B": func})
expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A"))

# ordered categorical dtype should be preserved
expected["B"] = expected["B"].astype(ds.dtype)

tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1653,6 +1675,9 @@ def test_categorical_transform():

expected["status"] = expected["status"].astype(delivery_status_type)

# .transform(max) should preserve ordered categoricals
expected["last_status"] = expected["last_status"].astype(delivery_status_type)

tm.assert_frame_equal(result, expected)


Expand Down