
REF: Share py_fallback #41289


Merged · 1 commit · May 4, 2021
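
For orientation, a minimal toy sketch of what this refactor does (illustrative only — the class and method bodies below are not pandas code): the pure-Python aggregation fallback that was previously duplicated in SeriesGroupBy._cython_agg_general and DataFrameGroupBy._cython_agg_general is hoisted into a single GroupBy._agg_py_fallback helper that both subclasses call.

    # Toy sketch of the refactor: one shared fallback on the base class
    # instead of two near-identical copies on the subclasses.
    from typing import Callable, Sequence


    class ToyGroupBy:
        def _agg_py_fallback(self, values: Sequence[float], alt: Callable) -> float:
            # Shared pure-Python fallback, defined once (cf. _agg_py_fallback below).
            return alt(values)

        def _cython_agg_general(self, values: Sequence[float], alt: Callable) -> float:
            try:
                # Stand-in for grouper._cython_operation, which raises
                # NotImplementedError for dtypes/ops the cython kernels can't handle.
                raise NotImplementedError
            except NotImplementedError:
                return self._agg_py_fallback(values, alt=alt)


    class ToySeriesGroupBy(ToyGroupBy):
        pass


    class ToyDataFrameGroupBy(ToyGroupBy):
        pass


    print(ToySeriesGroupBy()._cython_agg_general([1.0, 2.0, 3.0], alt=sum))  # 6.0
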
89 changes: 10 additions & 79 deletions pandas/core/groupby/generic.py
@@ -69,7 +69,6 @@
     validate_func_kwargs,
 )
 from pandas.core.apply import GroupByApply
-from pandas.core.arrays import Categorical
 from pandas.core.base import (
     DataError,
     SpecificationError,
@@ -84,7 +83,6 @@
     _agg_template,
     _apply_docs,
     _transform_template,
-    get_groupby,
     group_selection_context,
 )
 from pandas.core.indexes.api import (
@@ -353,6 +351,7 @@ def _cython_agg_general(

         obj = self._selected_obj
         objvals = obj._values
+        data = obj._mgr

         if numeric_only and not is_numeric_dtype(obj.dtype):
             raise DataError("No numeric types to aggregate")
@@ -362,28 +361,15 @@
         def array_func(values: ArrayLike) -> ArrayLike:
             try:
                 result = self.grouper._cython_operation(
-                    "aggregate", values, how, axis=0, min_count=min_count
+                    "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
                 )
             except NotImplementedError:
-                ser = Series(values)  # equiv 'obj' from outer frame
-                if self.ngroups > 0:
-                    res_values, _ = self.grouper.agg_series(ser, alt)
-                else:
-                    # equiv: res_values = self._python_agg_general(alt)
-                    # error: Incompatible types in assignment (expression has
-                    # type "Union[DataFrame, Series]", variable has type
-                    # "Union[ExtensionArray, ndarray]")
-                    res_values = self._python_apply_general(  # type: ignore[assignment]
-                        alt, ser
-                    )
+                # generally if we have numeric_only=False
+                # and non-applicable functions
+                # try to python agg
+                # TODO: shouldn't min_count matter?
+                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

-                if isinstance(values, Categorical):
-                    # Because we only get here with known dtype-preserving
-                    # reductions, we cast back to Categorical.
-                    # TODO: if we ever get "rank" working, exclude it here.
-                    result = type(values)._from_sequence(res_values, dtype=values.dtype)
-                else:
-                    result = res_values
             return result

         result = array_func(objvals)
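
For context, the NotImplementedError branch above is reachable from the public API: the cython kernels do not handle object dtype for reductions like "min" (a sketch, assuming that is still the case on this branch), so an object-dtype aggregation takes the pure-Python fallback.

    import pandas as pd

    # Object dtype exercises the fallback: _cython_operation raises
    # NotImplementedError for an object-dtype "min", and the except branch
    # above routes the work through _agg_py_fallback.
    df = pd.DataFrame({"key": ["a", "a", "b"], "val": ["x", "y", "z"]})
    print(df.groupby("key")["val"].min())
    # key
    # a    x
    # b    z
    # Name: val, dtype: object
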
@@ -1115,72 +1101,17 @@ def _cython_agg_general(
         if numeric_only:
             data = data.get_numeric_data(copy=False)

-        def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike:
-            # see if we can cast the values to the desired dtype
-            # this may not be the original dtype
-
-            if isinstance(result.dtype, np.dtype) and result.ndim == 1:
-                # We went through a SeriesGroupByPath and need to reshape
-                # GH#32223 includes case with IntegerArray values
-                # We only get here with values.dtype == object
-                result = result.reshape(1, -1)
-                # test_groupby_duplicate_columns gets here with
-                # result.dtype == int64, values.dtype=object, how="min"
-
-            return result
-
-        def py_fallback(values: ArrayLike) -> ArrayLike:
-            # if self.grouper.aggregate fails, we fall back to a pure-python
-            # solution
-
-            # We get here with a) EADtypes and b) object dtype
-            obj: FrameOrSeriesUnion
-
-            # call our grouper again with only this block
-            if values.ndim == 1:
-                # We only get here with ExtensionArray
-
-                obj = Series(values)
-            else:
-                # We only get here with values.dtype == object
-                # TODO special case not needed with ArrayManager
-                df = DataFrame(values.T)
-                # bc we split object blocks in grouped_reduce, we have only 1 col
-                # otherwise we'd have to worry about block-splitting GH#39329
-                assert df.shape[1] == 1
-                # Avoid call to self.values that can occur in DataFrame
-                # reductions; see GH#28949
-                obj = df.iloc[:, 0]
-
-            # Create SeriesGroupBy with observed=True so that it does
-            # not try to add missing categories if grouping over multiple
-            # Categoricals. This will be done later by self._reindex_output()
-            # Doing it here creates an error. See GH#34951
-            sgb = get_groupby(obj, self.grouper, observed=True)
-
-            # Note: bc obj is always a Series here, we can ignore axis and pass
-            # `alt` directly instead of `lambda x: alt(x, axis=self.axis)`
-            # use _agg_general bc it will go through _cython_agg_general
-            # which will correctly cast Categoricals.
-            res_ser = sgb._agg_general(
-                numeric_only=False, min_count=min_count, alias=how, npfunc=alt
-            )
-
-            # unwrap Series to get array
-            res_values = res_ser._mgr.arrays[0]
-            return cast_agg_result(res_values, values)
-
         def array_func(values: ArrayLike) -> ArrayLike:

             try:
                 result = self.grouper._cython_operation(
-                    "aggregate", values, how, axis=1, min_count=min_count
+                    "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
                 )
             except NotImplementedError:
                 # generally if we have numeric_only=False
                 # and non-applicable functions
                 # try to python agg
-                result = py_fallback(values)

Review comment (Contributor): does this allow py_fallback to be removed?

Reply (Member, author): not completely, just defined in one place instead of 2.

+                # TODO: shouldn't min_count matter?
+                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

             return result

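One detail from the removed py_fallback above that survives in the shared helper: because the fallback only runs for known dtype-preserving reductions, a Categorical input is re-wrapped from the raw result via _from_sequence. A minimal sketch of that cast-back (simplified; uses the extension-array constructor shown in the diff):

    import numpy as np
    import pandas as pd

    # Sketch of the Categorical cast-back done in _agg_py_fallback:
    # the python fallback yields plain object values, which are re-wrapped
    # in the input's dtype for dtype-preserving reductions.
    values = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
    res_values = np.array(["a", "b"], dtype=object)  # e.g. per-group "min"
    result = type(values)._from_sequence(res_values, dtype=values.dtype)
    print(result)        # ['a', 'b'] with categories ['a', 'b', 'c']
    print(result.dtype)  # category
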
49 changes: 49 additions & 0 deletions pandas/core/groupby/groupby.py
@@ -100,6 +100,7 @@ class providing the base-class of operations.
     Index,
     MultiIndex,
 )
+from pandas.core.internals.blocks import ensure_block_shape
 from pandas.core.series import Series
 from pandas.core.sorting import get_group_index_sorter
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
@@ -1313,6 +1314,54 @@ def _agg_general(
             result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
         return result.__finalize__(self.obj, method="groupby")

+    def _agg_py_fallback(
+        self, values: ArrayLike, ndim: int, alt: Callable
+    ) -> ArrayLike:
+        """
+        Fallback to pure-python aggregation if _cython_operation raises
+        NotImplementedError.
+        """
+        # We get here with a) EADtypes and b) object dtype
+
+        if values.ndim == 1:
+            # For DataFrameGroupBy we only get here with ExtensionArray
+            ser = Series(values)
+        else:
+            # We only get here with values.dtype == object
+            # TODO: special case not needed with ArrayManager
+            df = DataFrame(values.T)
+            # bc we split object blocks in grouped_reduce, we have only 1 col
+            # otherwise we'd have to worry about block-splitting GH#39329
+            assert df.shape[1] == 1
+            # Avoid call to self.values that can occur in DataFrame
+            # reductions; see GH#28949
+            ser = df.iloc[:, 0]
+
+        # Create SeriesGroupBy with observed=True so that it does
+        # not try to add missing categories if grouping over multiple
+        # Categoricals. This will be done later by self._reindex_output()
+        # Doing it here creates an error. See GH#34951
+        sgb = get_groupby(ser, self.grouper, observed=True)
+        # For SeriesGroupBy we could just use self instead of sgb
+
+        if self.ngroups > 0:
+            res_values, _ = self.grouper.agg_series(ser, alt)
+        else:
+            # equiv: res_values = self._python_agg_general(alt)
+            res_values = sgb._python_apply_general(alt, ser)._values
+
+        if isinstance(values, Categorical):
+            # Because we only get here with known dtype-preserving
+            # reductions, we cast back to Categorical.
+            # TODO: if we ever get "rank" working, exclude it here.
+            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)
+
+        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
+        # then we need to reshape
+        # GH#32223 includes case with IntegerArray values, ndarray res_values
+        # test_groupby_duplicate_columns with object dtype values
+        return ensure_block_shape(res_values, ndim=ndim)
+
     def _cython_agg_general(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
     ):
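
The observed=True choice in _agg_py_fallback above is load-bearing: with multiple Categorical groupers, observed=False would materialize unobserved category combinations inside the fallback, whereas that reindexing is deliberately left to self._reindex_output (see GH#34951). A small illustration of the difference using the public API:

    import pandas as pd

    # With two Categorical groupers, observed=False emits the full cartesian
    # product of categories; observed=True keeps only combinations that occur.
    cat1 = pd.Categorical(["a", "a"], categories=["a", "b"])
    cat2 = pd.Categorical(["x", "y"], categories=["x", "y"])
    ser = pd.Series([1, 2])
    print(ser.groupby([cat1, cat2], observed=True).sum())   # 2 rows
    print(ser.groupby([cat1, cat2], observed=False).sum())  # 4 rows (unobserved combos included)
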
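Finally, the ensure_block_shape call at the end of _agg_py_fallback: when the DataFrame path funnels through a Series, the 1D result has to be restored to a 2D block shape. A simplified re-implementation of the idea (not the pandas source; the real helper also special-cases extension arrays):

    import numpy as np

    def ensure_block_shape_sketch(values: np.ndarray, ndim: int = 1) -> np.ndarray:
        # Reshape a 1D result to (1, N) when the caller holds 2D block data,
        # mirroring what pandas.core.internals.blocks.ensure_block_shape
        # does for ndarrays (GH#32223).
        if values.ndim < ndim:
            values = values.reshape(1, -1)
        return values

    res = np.array([10, 20, 30])
    print(ensure_block_shape_sketch(res, ndim=2).shape)  # (1, 3)
    print(ensure_block_shape_sketch(res, ndim=1).shape)  # (3,)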