diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4f60660dfb499..c04ad0e9dfa30 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -69,7 +69,6 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.arrays import Categorical from pandas.core.base import ( DataError, SpecificationError, @@ -84,7 +83,6 @@ _agg_template, _apply_docs, _transform_template, - get_groupby, group_selection_context, ) from pandas.core.indexes.api import ( @@ -353,6 +351,7 @@ def _cython_agg_general( obj = self._selected_obj objvals = obj._values + data = obj._mgr if numeric_only and not is_numeric_dtype(obj.dtype): raise DataError("No numeric types to aggregate") @@ -362,28 +361,15 @@ def _cython_agg_general( def array_func(values: ArrayLike) -> ArrayLike: try: result = self.grouper._cython_operation( - "aggregate", values, how, axis=0, min_count=min_count + "aggregate", values, how, axis=data.ndim - 1, min_count=min_count ) except NotImplementedError: - ser = Series(values) # equiv 'obj' from outer frame - if self.ngroups > 0: - res_values, _ = self.grouper.agg_series(ser, alt) - else: - # equiv: res_values = self._python_agg_general(alt) - # error: Incompatible types in assignment (expression has - # type "Union[DataFrame, Series]", variable has type - # "Union[ExtensionArray, ndarray]") - res_values = self._python_apply_general( # type: ignore[assignment] - alt, ser - ) + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? + result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) - if isinstance(values, Categorical): - # Because we only get here with known dtype-preserving - # reductions, we cast back to Categorical. - # TODO: if we ever get "rank" working, exclude it here. 
- result = type(values)._from_sequence(res_values, dtype=values.dtype) - else: - result = res_values return result result = array_func(objvals) @@ -1115,72 +1101,17 @@ def _cython_agg_general( if numeric_only: data = data.get_numeric_data(copy=False) - def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike: - # see if we can cast the values to the desired dtype - # this may not be the original dtype - - if isinstance(result.dtype, np.dtype) and result.ndim == 1: - # We went through a SeriesGroupByPath and need to reshape - # GH#32223 includes case with IntegerArray values - # We only get here with values.dtype == object - result = result.reshape(1, -1) - # test_groupby_duplicate_columns gets here with - # result.dtype == int64, values.dtype=object, how="min" - - return result - - def py_fallback(values: ArrayLike) -> ArrayLike: - # if self.grouper.aggregate fails, we fall back to a pure-python - # solution - - # We get here with a) EADtypes and b) object dtype - obj: FrameOrSeriesUnion - - # call our grouper again with only this block - if values.ndim == 1: - # We only get here with ExtensionArray - - obj = Series(values) - else: - # We only get here with values.dtype == object - # TODO special case not needed with ArrayManager - df = DataFrame(values.T) - # bc we split object blocks in grouped_reduce, we have only 1 col - # otherwise we'd have to worry about block-splitting GH#39329 - assert df.shape[1] == 1 - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - obj = df.iloc[:, 0] - - # Create SeriesGroupBy with observed=True so that it does - # not try to add missing categories if grouping over multiple - # Categoricals. This will done by later self._reindex_output() - # Doing it here creates an error. 
See GH#34951 - sgb = get_groupby(obj, self.grouper, observed=True) - - # Note: bc obj is always a Series here, we can ignore axis and pass - # `alt` directly instead of `lambda x: alt(x, axis=self.axis)` - # use _agg_general bc it will go through _cython_agg_general - # which will correctly cast Categoricals. - res_ser = sgb._agg_general( - numeric_only=False, min_count=min_count, alias=how, npfunc=alt - ) - - # unwrap Series to get array - res_values = res_ser._mgr.arrays[0] - return cast_agg_result(res_values, values) - def array_func(values: ArrayLike) -> ArrayLike: - try: result = self.grouper._cython_operation( - "aggregate", values, how, axis=1, min_count=min_count + "aggregate", values, how, axis=data.ndim - 1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False # and non-applicable functions # try to python agg - result = py_fallback(values) + # TODO: shouldn't min_count matter? + result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 620668dadc32d..5b2b00713b318 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -100,6 +100,7 @@ class providing the base-class of operations. Index, MultiIndex, ) +from pandas.core.internals.blocks import ensure_block_shape from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -1313,6 +1314,54 @@ def _agg_general( result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) return result.__finalize__(self.obj, method="groupby") + def _agg_py_fallback( + self, values: ArrayLike, ndim: int, alt: Callable + ) -> ArrayLike: + """ + Fallback to pure-python aggregation if _cython_operation raises + NotImplementedError. 
+ """ + # We get here with a) EADtypes and b) object dtype + + if values.ndim == 1: + # For DataFrameGroupBy we only get here with ExtensionArray + ser = Series(values) + else: + # We only get here with values.dtype == object + # TODO: special case not needed with ArrayManager + df = DataFrame(values.T) + # bc we split object blocks in grouped_reduce, we have only 1 col + # otherwise we'd have to worry about block-splitting GH#39329 + assert df.shape[1] == 1 + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + ser = df.iloc[:, 0] + + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will be done later by self._reindex_output() + # Doing it here creates an error. See GH#34951 + sgb = get_groupby(ser, self.grouper, observed=True) + # For SeriesGroupBy we could just use self instead of sgb + + if self.ngroups > 0: + res_values, _ = self.grouper.agg_series(ser, alt) + else: + # equiv: res_values = self._python_agg_general(alt) + res_values = sgb._python_apply_general(alt, ser)._values + + if isinstance(values, Categorical): + # Because we only get here with known dtype-preserving + # reductions, we cast back to Categorical. + # TODO: if we ever get "rank" working, exclude it here. + res_values = type(values)._from_sequence(res_values, dtype=values.dtype) + + # If we are DataFrameGroupBy and went through a SeriesGroupByPath + # then we need to reshape + # GH#32223 includes case with IntegerArray values, ndarray res_values + # test_groupby_duplicate_columns with object dtype values + return ensure_block_shape(res_values, ndim=ndim) + def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ):