From c2fae931e58ee5433ab3997133d60025327419a2 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Feb 2023 20:02:53 -0800 Subject: [PATCH 1/6] REF: avoid handling corner cases in op_via_apply --- pandas/core/groupby/generic.py | 30 ++++++++++++++++++------------ pandas/core/groupby/groupby.py | 6 ------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 815f9936057f4..0501ead7c7928 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -380,10 +380,16 @@ def _wrap_applied_output( """ if len(values) == 0: # GH #6265 + if is_transform: + # GH#47787 see test_group_on_empty_multiindex + res_index = data.index + else: + res_index = self.grouper.result_index + return self.obj._constructor( [], name=self.obj.name, - index=self.grouper.result_index, + index=res_index, dtype=data.dtype, ) assert values is not None @@ -1136,14 +1142,12 @@ def cov( @property @doc(Series.is_monotonic_increasing.__doc__) def is_monotonic_increasing(self) -> Series: - result = self._op_via_apply("is_monotonic_increasing") - return result + return self.apply(lambda ser: ser.is_monotonic_increasing) @property @doc(Series.is_monotonic_decreasing.__doc__) def is_monotonic_decreasing(self) -> Series: - result = self._op_via_apply("is_monotonic_decreasing") - return result + return self.apply(lambda ser: ser.is_monotonic_decreasing) @doc(Series.hist.__doc__) def hist( @@ -1181,8 +1185,7 @@ def hist( @property @doc(Series.dtype.__doc__) def dtype(self) -> Series: - result = self._op_via_apply("dtype") - return result + return self.apply(lambda ser: ser.dtype) @doc(Series.unique.__doc__) def unique(self) -> Series: @@ -1428,9 +1431,13 @@ def _wrap_applied_output( ): if len(values) == 0: - result = self.obj._constructor( - index=self.grouper.result_index, columns=data.columns - ) + if is_transform: + # GH#47787 see test_group_on_empty_multiindex + res_index = data.index + else: + res_index = self.grouper.result_index + + result = self.obj._constructor(index=res_index, columns=data.columns) result = result.astype(data.dtypes, copy=False) return result @@ -2677,8 +2684,7 @@ def hist( @property @doc(DataFrame.dtypes.__doc__) def dtypes(self) -> Series: - result = self._op_via_apply("dtypes") - return result + return self.apply(lambda df: df.dtypes) @doc(DataFrame.corrwith.__doc__) def corrwith( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fd9a06a06cfa7..d4c2013cc578e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -956,9 +956,6 @@ def __getattr__(self, attr: str): def _op_via_apply(self, name: str, *args, **kwargs): """Compute the result of an operation by using GroupBy's apply.""" f = getattr(type(self._obj_with_exclusions), name) - if not callable(f): - return self.apply(lambda self: getattr(self, name)) - sig = inspect.signature(f) # a little trickery for aggregation functions that need an axis @@ -980,9 +977,6 @@ def curried(x): return self.apply(curried) is_transform = name in base.transformation_kernels - # Transform needs to keep the same schema, including when empty - if is_transform and self._obj_with_exclusions.empty: - return self._obj_with_exclusions result = self._python_apply_general( curried, self._obj_with_exclusions, From 0ee4baea8946c5c3066e804e9ba079020a046d50 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 5 Feb 2023 11:48:02 -0800 Subject: [PATCH 2/6] simplify _wrap_aggregated_output --- pandas/core/groupby/groupby.py | 24 ++++++++++-------------- pandas/core/groupby/ops.py | 2 -- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d4c2013cc578e..c5923715b2114 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1125,7 +1125,7 @@ def _indexed_output_to_ndframe( @final def _wrap_aggregated_output( self, - output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike], + result: Series | DataFrame, qs: npt.NDArray[np.float64] | None = None, ): """ @@ -1133,22 +1133,14 @@ def _wrap_aggregated_output( Parameters ---------- - output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike] - Data to wrap. + result : Series, DataFrame Returns ------- Series or DataFrame """ - - if isinstance(output, (Series, DataFrame)): - # We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce, - # in which case our columns are already set correctly. - # ATM we do not get here for SeriesGroupBy; when we do, we will - # need to require that result.name already match self.obj.name - result = output - else: - result = self._indexed_output_to_ndframe(output) + # ATM we do not get here for SeriesGroupBy; when we do, we will + # need to require that result.name already match self.obj.name if not self.as_index: # `not self.as_index` is only relevant for DataFrameGroupBy, @@ -1450,7 +1442,8 @@ def _python_agg_general(self, func, *args, **kwargs): output: dict[base.OutputKey, ArrayLike] = {} if self.ngroups == 0: - # agg_series below assumes ngroups > 0 + # e.g. test_evaluate_with_empty_groups different path gets different + # result dtype in empty case. return self._python_apply_general(f, self._selected_obj, is_agg=True) for idx, obj in enumerate(self._iterate_slices()): @@ -1460,9 +1453,11 @@ def _python_agg_general(self, func, *args, **kwargs): output[key] = result if not output: + # e.g. test_groupby_crash_on_nunique, test_margins_no_values_no_cols return self._python_apply_general(f, self._selected_obj) - return self._wrap_aggregated_output(output) + result = self._indexed_output_to_ndframe(output) + return self._wrap_aggregated_output(result) @final def _agg_general( @@ -2571,6 +2566,7 @@ def ohlc(self) -> DataFrame: ) return self._reindex_output(result) + # TODO: 2023-02-05 all tests that get here have self.as_index return self._apply_to_column_groupbys( lambda x: x.ohlc(), self._obj_with_exclusions ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bff61ec135d74..14ca9066dae77 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1028,7 +1028,6 @@ def _aggregate_series_pure_python( ) -> npt.NDArray[np.object_]: ids, _, ngroups = self.group_info - counts = np.zeros(ngroups, dtype=int) result = np.empty(ngroups, dtype="O") initialized = False @@ -1044,7 +1043,6 @@ def _aggregate_series_pure_python( libreduction.check_result_array(res, group.dtype) initialized = True - counts[i] = group.shape[0] result[i] = res return result From 1b9a2891f5f7e22a8775d0eaaac08dae84ee96cd Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 5 Feb 2023 16:17:42 -0800 Subject: [PATCH 3/6] REF: remove _wrap_transformed_output --- pandas/core/groupby/generic.py | 7 +----- pandas/core/groupby/groupby.py | 39 +++++++--------------------------- pandas/core/resample.py | 1 + 3 files changed, 10 insertions(+), 37 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0501ead7c7928..fe21e27bbce5c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1731,13 +1731,8 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) inds.append(i) - if not output: - raise TypeError("Transform function invalid for data types") - - columns = obj.columns.take(inds) - result = self.obj._constructor(output, index=obj.index) - result.columns = columns + result.columns = obj.columns return result def filter(self, func, dropna: bool = True, *args, **kwargs): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c5923715b2114..d9bb2844d798d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1169,36 +1169,6 @@ def _wrap_aggregated_output( return self._reindex_output(result, qs=qs) - @final - def _wrap_transformed_output( - self, output: Mapping[base.OutputKey, ArrayLike] - ) -> Series | DataFrame: - """ - Wraps the output of GroupBy transformations into the expected result. - - Parameters - ---------- - output : Mapping[base.OutputKey, ArrayLike] - Data to wrap. - - Returns - ------- - Series or DataFrame - Series for SeriesGroupBy, DataFrame for DataFrameGroupBy - """ - if isinstance(output, (Series, DataFrame)): - result = output - else: - result = self._indexed_output_to_ndframe(output) - - if self.axis == 1: - # Only relevant for DataFrameGroupBy - result = result.T - result.columns = self.obj.columns - - result.index = self.obj.index - return result - def _wrap_applied_output( self, data, @@ -1839,6 +1809,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in # _wrap_agged_manager() returns. GH 35028 + # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false with com.temp_setattr(self, "observed", True): result = self._wrap_agged_manager(new_mgr) @@ -2844,7 +2815,13 @@ def blk_func(values: ArrayLike) -> ArrayLike: if isinstance(new_obj, Series): new_obj.name = obj.name - return self._wrap_transformed_output(new_obj) + if self.axis == 1: + # Only relevant for DataFrameGroupBy + new_obj = new_obj.T + new_obj.columns = self.obj.columns + + new_obj.index = self.obj.index + return new_obj @final @Substitution(name="groupby") diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0cb2765b439bc..01f0ddd1627c7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -435,6 +435,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): try: if isinstance(obj, ABCDataFrame) and callable(how): # Check if the function is reducing or not. + # e.g. test_resample_apply_with_additional_args result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) From fca5cc354b8578d4394904b92bd178b87ade9737 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 5 Feb 2023 16:41:39 -0800 Subject: [PATCH 4/6] final --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d9bb2844d798d..04a0840f8adb4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1099,6 +1099,7 @@ def _set_result_index_ordered( return result + @final def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: if isinstance(result, Series): result = result.to_frame() From cb499e9e450fd18dfc2b661e9f6aaab5580d8513 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 5 Feb 2023 18:41:45 -0800 Subject: [PATCH 5/6] mypy fixup --- pandas/core/groupby/generic.py | 3 ++- pandas/core/groupby/groupby.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fe21e27bbce5c..20a1cb6e5ee3c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2679,7 +2679,8 @@ def hist( @property @doc(DataFrame.dtypes.__doc__) def dtypes(self) -> Series: - return self.apply(lambda df: df.dtypes) + # error: Incompatible return value type (got "DataFrame", expected "Series") + return self.apply(lambda df: df.dtypes) # type: ignore[return-value] @doc(DataFrame.corrwith.__doc__) def corrwith( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 04a0840f8adb4..be22b05cbe1ab 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1427,8 +1427,8 @@ def _python_agg_general(self, func, *args, **kwargs): # e.g. test_groupby_crash_on_nunique, test_margins_no_values_no_cols return self._python_apply_general(f, self._selected_obj) - result = self._indexed_output_to_ndframe(output) - return self._wrap_aggregated_output(result) + res = self._indexed_output_to_ndframe(output) + return self._wrap_aggregated_output(res) @final def _agg_general( From 5c9d6ec0490191783421e2a87fa6d1a663d51873 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 5 Feb 2023 19:00:06 -0800 Subject: [PATCH 6/6] remove unnecessary --- pandas/core/groupby/generic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 20a1cb6e5ee3c..aec2037d044b8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1726,10 +1726,8 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: # iterate through columns, see test_transform_exclude_nuisance # gets here with non-unique columns output = {} - inds = [] for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)): output[i] = sgb.transform(wrapper) - inds.append(i) result = self.obj._constructor(output, index=obj.index) result.columns = obj.columns