Skip to content

REF: share _cython_agg_general #43762

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 3 additions & 94 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
is_dict_like,
is_integer_dtype,
is_interval_dtype,
is_numeric_dtype,
is_scalar,
)
from pandas.core.dtypes.missing import (
Expand Down Expand Up @@ -78,6 +77,7 @@
_apply_docs,
_transform_template,
group_selection_context,
warn_dropping_nuisance_columns_deprecated,
)
from pandas.core.indexes.api import (
Index,
Expand Down Expand Up @@ -334,43 +334,6 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
output = self._reindex_output(output)
return output

def _cython_agg_general(
    self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
):
    """
    Aggregate the selected object's values with the cython kernel ``how``.

    Parameters
    ----------
    how : str
        Name of the aggregation handed to ``self.grouper._cython_operation``.
    alt : Callable
        Fallback passed to ``self._agg_py_fallback`` when the cython
        operation raises ``NotImplementedError``.
    numeric_only : bool
        If True and the selected object's dtype is not numeric, raise
        instead of aggregating.
    min_count : int, default -1
        Forwarded unchanged to the cython operation.

    Returns
    -------
    The result of ``self._reindex_output`` applied to an object built with
    ``self.obj._constructor`` and indexed by ``self.grouper.result_index``.

    Raises
    ------
    NotImplementedError
        If ``numeric_only`` is True and the selected dtype is not numeric.
    """
    selected = self._selected_obj
    values = selected._values
    mgr = selected._mgr

    # GH#41291 match Series behavior
    if numeric_only and not is_numeric_dtype(selected.dtype):
        raise NotImplementedError(
            f"{type(self).__name__}.{how} does not implement numeric_only."
        )

    try:
        aggregated = self.grouper._cython_operation(
            "aggregate", values, how, axis=mgr.ndim - 1, min_count=min_count
        )
    except NotImplementedError:
        # generally if we have numeric_only=False and non-applicable
        # functions: try to python agg
        # TODO: shouldn't min_count matter?
        aggregated = self._agg_py_fallback(values, ndim=mgr.ndim, alt=alt)

    wrapped = self.obj._constructor(
        aggregated, index=self.grouper.result_index, name=selected.name
    )
    return self._reindex_output(wrapped)

def _indexed_output_to_ndframe(
self, output: Mapping[base.OutputKey, ArrayLike]
) -> Series:
Expand Down Expand Up @@ -970,46 +933,6 @@ def _iterate_slices(self) -> Iterable[Series]:

yield values

def _cython_agg_general(
    self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
) -> DataFrame:
    """
    Aggregate every block of the data manager with the cython kernel ``how``.

    Parameters
    ----------
    how : str
        Name of the aggregation handed to ``self.grouper._cython_operation``.
    alt : Callable
        Fallback passed to ``self._agg_py_fallback`` when the cython
        operation raises ``NotImplementedError``.
    numeric_only : bool
        If True, restrict the manager to its numeric data before reducing.
    min_count : int, default -1
        Forwarded unchanged to the cython operation.

    Returns
    -------
    DataFrame
        Whatever ``self._wrap_agged_manager`` builds from the reduced manager.

    Warns
    -----
    FutureWarning
        When the reduced manager is shorter than the input manager.
    """
    # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
    mgr: Manager2D = self._get_data_to_aggregate()
    if numeric_only:
        mgr = mgr.get_numeric_data(copy=False)

    def array_func(values: ArrayLike) -> ArrayLike:
        try:
            return self.grouper._cython_operation(
                "aggregate", values, how, axis=mgr.ndim - 1, min_count=min_count
            )
        except NotImplementedError:
            # generally if we have numeric_only=False and non-applicable
            # functions: try to python agg
            # TODO: shouldn't min_count matter?
            return self._agg_py_fallback(values, ndim=mgr.ndim, alt=alt)

    # TypeError -> we may have an exception in trying to aggregate
    # continue and exclude the block
    reduced = mgr.grouped_reduce(array_func, ignore_failures=True)

    if len(reduced) < len(mgr):
        message = (
            f"Dropping invalid columns in {type(self).__name__}.{how} "
            "is deprecated. In a future version, a TypeError will be raised. "
            f"Before calling .{how}, select only columns which should be "
            "valid for the function."
        )
        warnings.warn(message, FutureWarning, stacklevel=4)

    return self._wrap_agged_manager(reduced)

def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
if self.grouper.nkeys != 1:
raise AssertionError("Number of keys must be 1")
Expand Down Expand Up @@ -1195,14 +1118,7 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < len(mgr):
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.{how} "
"is deprecated. In a future version, a TypeError will be raised. "
f"Before calling .{how}, select only columns which should be "
"valid for the transforming function.",
FutureWarning,
stacklevel=4,
)
warn_dropping_nuisance_columns_deprecated(type(self), how)

res_df = self.obj._constructor(res_mgr)
if self.axis == 1:
Expand Down Expand Up @@ -1314,14 +1230,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
output[i] = sgb.transform(wrapper)
except TypeError:
# e.g. trying to call nanmean with string values
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.transform "
"is deprecated. In a future version, a TypeError will be raised. "
"Before calling .transform, select only columns which should be "
"valid for the transforming function.",
FutureWarning,
stacklevel=5,
)
warn_dropping_nuisance_columns_deprecated(type(self), "transform")
else:
inds.append(i)

Expand Down
111 changes: 73 additions & 38 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1422,14 +1422,7 @@ def _python_agg_general(self, func, *args, **kwargs):
# if this function is invalid for this dtype, we will ignore it.
result = self.grouper.agg_series(obj, f)
except TypeError:
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.agg "
"is deprecated. In a future version, a TypeError will be raised. "
"Before calling .agg, select only columns which should be "
"valid for the aggregating function.",
FutureWarning,
stacklevel=3,
)
warn_dropping_nuisance_columns_deprecated(type(self), "agg")
continue

key = base.OutputKey(label=name, position=idx)
Expand Down Expand Up @@ -1500,10 +1493,52 @@ def _agg_py_fallback(
# test_groupby_duplicate_columns with object dtype values
return ensure_block_shape(res_values, ndim=ndim)

@final
def _cython_agg_general(
    self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
):
    """
    Aggregate the grouped data with the cython kernel named ``how``.

    Shared implementation for 1-dimensional and 2-dimensional data; the
    two cases are told apart by ``ndim`` of the manager returned from
    ``self._get_data_to_aggregate()``.

    Parameters
    ----------
    how : str
        Name of the aggregation handed to ``self.grouper._cython_operation``.
    alt : Callable
        Fallback passed to ``self._agg_py_fallback`` when the cython
        operation raises ``NotImplementedError``.
    numeric_only : bool
        For 2-dimensional data, restrict the manager to its numeric data.
        For 1-dimensional data, raise if the selected dtype is not numeric.
    min_count : int, default -1
        Forwarded unchanged to the cython operation.

    Returns
    -------
    The object built by ``self._wrap_agged_manager``. For 1-dimensional
    data its index is replaced with ``self.grouper.result_index`` and it is
    passed through ``self._reindex_output`` first.

    Raises
    ------
    NotImplementedError
        If ``numeric_only`` is True for 1-dimensional data whose dtype is
        not numeric.
    """
    # Note: we never get here with how="ohlc" for DataFrameGroupBy;
    #  that goes through SeriesGroupBy

    data = self._get_data_to_aggregate()
    is_ser = data.ndim == 1

    if numeric_only:
        if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
            # GH#41291 match Series behavior
            raise NotImplementedError(
                f"{type(self).__name__}.{how} does not implement numeric_only."
            )
        elif not is_ser:
            data = data.get_numeric_data(copy=False)

    def array_func(values: ArrayLike) -> ArrayLike:
        try:
            result = self.grouper._cython_operation(
                "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
            )
        except NotImplementedError:
            # generally if we have numeric_only=False
            # and non-applicable functions
            # try to python agg
            # TODO: shouldn't min_count matter?
            result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

        return result

    # TypeError -> we may have an exception in trying to aggregate
    #  continue and exclude the block
    new_mgr = data.grouped_reduce(array_func, ignore_failures=True)

    # Only the 2-dimensional case can silently lose columns.
    if not is_ser and len(new_mgr) < len(data):
        warn_dropping_nuisance_columns_deprecated(type(self), how)

    res = self._wrap_agged_manager(new_mgr)
    if is_ser:
        res.index = self.grouper.result_index
        return self._reindex_output(res)
    else:
        return res

def _cython_transform(
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
Expand Down Expand Up @@ -1769,8 +1804,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
# _wrap_agged_manager() returns. GH 35028
with com.temp_setattr(self, "observed", True):
result = self._wrap_agged_manager(new_mgr)
if result.ndim == 1:
result.index = self.grouper.result_index

if result.ndim == 1:
result.index = self.grouper.result_index

return self._reindex_output(result, fill_value=0)

Expand Down Expand Up @@ -2709,18 +2745,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
if not is_ser and len(res_mgr.items) != len(mgr.items):
warnings.warn(
"Dropping invalid columns in "
f"{type(self).__name__}.quantile is deprecated. "
"In a future version, a TypeError will be raised. "
"Before calling .quantile, select only columns which "
"should be valid for the function.",
FutureWarning,
stacklevel=find_stack_level(),
)
warn_dropping_nuisance_columns_deprecated(type(self), "quantile")

if len(res_mgr.items) == 0:
# re-call grouped_reduce to get the desired exception message
mgr.grouped_reduce(blk_func, ignore_failures=False)
# grouped_reduce _should_ raise, so this should not be reached
raise TypeError( # pragma: no cover
"All columns were dropped in grouped_reduce"
)

if is_ser:
res = self._wrap_agged_manager(res_mgr)
Expand Down Expand Up @@ -3150,30 +3183,20 @@ def blk_func(values: ArrayLike) -> ArrayLike:

if not is_ser and len(res_mgr.items) != len(mgr.items):
howstr = how.replace("group_", "")
warnings.warn(
"Dropping invalid columns in "
f"{type(self).__name__}.{howstr} is deprecated. "
"In a future version, a TypeError will be raised. "
f"Before calling .{howstr}, select only columns which "
"should be valid for the function.",
FutureWarning,
stacklevel=3,
)
warn_dropping_nuisance_columns_deprecated(type(self), howstr)

if len(res_mgr.items) == 0:
# We re-call grouped_reduce to get the right exception message
try:
mgr.grouped_reduce(blk_func, ignore_failures=False)
except Exception as err:
error_msg = str(err)
raise TypeError(error_msg)
# We should never get here
raise TypeError("All columns were dropped in grouped_reduce")
mgr.grouped_reduce(blk_func, ignore_failures=False)
# grouped_reduce _should_ raise, so this should not be reached
raise TypeError( # pragma: no cover
"All columns were dropped in grouped_reduce"
)

if is_ser:
out = self._wrap_agged_manager(res_mgr)
out.index = self.grouper.result_index
else:
out = type(obj)(res_mgr)
out = obj._constructor(res_mgr)

return self._wrap_aggregated_output(out)

Expand Down Expand Up @@ -3627,3 +3650,15 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
else:
mi = MultiIndex.from_product([idx, qs])
return mi


def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None:
    """
    Emit the FutureWarning about dropping invalid columns.

    Parameters
    ----------
    cls : type
        Class whose ``__name__`` is shown in the message.
    how : str
        Name of the method shown in the message.
    """
    qualified = f"{cls.__name__}.{how}"
    message = (
        f"Dropping invalid columns in {qualified} is deprecated. "
        "In a future version, a TypeError will be raised. "
        f"Before calling .{how}, select only columns which "
        "should be valid for the function."
    )
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())