Skip to content

Commit bae0bf0

Browse files
authored
REF: prune groupby paths (#51187)
* REF: avoid handling corner cases in op_via_apply
* simplify _wrap_aggregated_output
* REF: remove _wrap_transformed_output
* final
* mypy fixup
* remove unnecessary
1 parent eec200d commit bae0bf0

File tree

4 files changed

+40
-73
lines changed

4 files changed

+40
-73
lines changed

pandas/core/groupby/generic.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -390,10 +390,16 @@ def _wrap_applied_output(
390390
"""
391391
if len(values) == 0:
392392
# GH #6265
393+
if is_transform:
394+
# GH#47787 see test_group_on_empty_multiindex
395+
res_index = data.index
396+
else:
397+
res_index = self.grouper.result_index
398+
393399
return self.obj._constructor(
394400
[],
395401
name=self.obj.name,
396-
index=self.grouper.result_index,
402+
index=res_index,
397403
dtype=data.dtype,
398404
)
399405
assert values is not None
@@ -1146,14 +1152,12 @@ def cov(
11461152
@property
11471153
@doc(Series.is_monotonic_increasing.__doc__)
11481154
def is_monotonic_increasing(self) -> Series:
1149-
result = self._op_via_apply("is_monotonic_increasing")
1150-
return result
1155+
return self.apply(lambda ser: ser.is_monotonic_increasing)
11511156

11521157
@property
11531158
@doc(Series.is_monotonic_decreasing.__doc__)
11541159
def is_monotonic_decreasing(self) -> Series:
1155-
result = self._op_via_apply("is_monotonic_decreasing")
1156-
return result
1160+
return self.apply(lambda ser: ser.is_monotonic_decreasing)
11571161

11581162
@doc(Series.hist.__doc__)
11591163
def hist(
@@ -1191,8 +1195,7 @@ def hist(
11911195
@property
11921196
@doc(Series.dtype.__doc__)
11931197
def dtype(self) -> Series:
1194-
result = self._op_via_apply("dtype")
1195-
return result
1198+
return self.apply(lambda ser: ser.dtype)
11961199

11971200
@doc(Series.unique.__doc__)
11981201
def unique(self) -> Series:
@@ -1438,9 +1441,13 @@ def _wrap_applied_output(
14381441
):
14391442

14401443
if len(values) == 0:
1441-
result = self.obj._constructor(
1442-
index=self.grouper.result_index, columns=data.columns
1443-
)
1444+
if is_transform:
1445+
# GH#47787 see test_group_on_empty_multiindex
1446+
res_index = data.index
1447+
else:
1448+
res_index = self.grouper.result_index
1449+
1450+
result = self.obj._constructor(index=res_index, columns=data.columns)
14441451
result = result.astype(data.dtypes, copy=False)
14451452
return result
14461453

@@ -1729,18 +1736,11 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
17291736
# iterate through columns, see test_transform_exclude_nuisance
17301737
# gets here with non-unique columns
17311738
output = {}
1732-
inds = []
17331739
for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)):
17341740
output[i] = sgb.transform(wrapper)
1735-
inds.append(i)
1736-
1737-
if not output:
1738-
raise TypeError("Transform function invalid for data types")
1739-
1740-
columns = obj.columns.take(inds)
17411741

17421742
result = self.obj._constructor(output, index=obj.index)
1743-
result.columns = columns
1743+
result.columns = obj.columns
17441744
return result
17451745

17461746
def filter(self, func, dropna: bool = True, *args, **kwargs):
@@ -2693,8 +2693,8 @@ def hist(
26932693
@property
26942694
@doc(DataFrame.dtypes.__doc__)
26952695
def dtypes(self) -> Series:
2696-
result = self._op_via_apply("dtypes")
2697-
return result
2696+
# error: Incompatible return value type (got "DataFrame", expected "Series")
2697+
return self.apply(lambda df: df.dtypes) # type: ignore[return-value]
26982698

26992699
@doc(DataFrame.corrwith.__doc__)
27002700
def corrwith(

pandas/core/groupby/groupby.py

Lines changed: 19 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -956,9 +956,6 @@ def __getattr__(self, attr: str):
956956
def _op_via_apply(self, name: str, *args, **kwargs):
957957
"""Compute the result of an operation by using GroupBy's apply."""
958958
f = getattr(type(self._obj_with_exclusions), name)
959-
if not callable(f):
960-
return self.apply(lambda self: getattr(self, name))
961-
962959
sig = inspect.signature(f)
963960

964961
# a little trickery for aggregation functions that need an axis
@@ -980,9 +977,6 @@ def curried(x):
980977
return self.apply(curried)
981978

982979
is_transform = name in base.transformation_kernels
983-
# Transform needs to keep the same schema, including when empty
984-
if is_transform and self._obj_with_exclusions.empty:
985-
return self._obj_with_exclusions
986980
result = self._python_apply_general(
987981
curried,
988982
self._obj_with_exclusions,
@@ -1105,6 +1099,7 @@ def _set_result_index_ordered(
11051099

11061100
return result
11071101

1102+
@final
11081103
def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
11091104
if isinstance(result, Series):
11101105
result = result.to_frame()
@@ -1131,30 +1126,22 @@ def _indexed_output_to_ndframe(
11311126
@final
11321127
def _wrap_aggregated_output(
11331128
self,
1134-
output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike],
1129+
result: Series | DataFrame,
11351130
qs: npt.NDArray[np.float64] | None = None,
11361131
):
11371132
"""
11381133
Wraps the output of GroupBy aggregations into the expected result.
11391134
11401135
Parameters
11411136
----------
1142-
output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike]
1143-
Data to wrap.
1137+
result : Series, DataFrame
11441138
11451139
Returns
11461140
-------
11471141
Series or DataFrame
11481142
"""
1149-
1150-
if isinstance(output, (Series, DataFrame)):
1151-
# We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce,
1152-
# in which case our columns are already set correctly.
1153-
# ATM we do not get here for SeriesGroupBy; when we do, we will
1154-
# need to require that result.name already match self.obj.name
1155-
result = output
1156-
else:
1157-
result = self._indexed_output_to_ndframe(output)
1143+
# ATM we do not get here for SeriesGroupBy; when we do, we will
1144+
# need to require that result.name already match self.obj.name
11581145

11591146
if not self.as_index:
11601147
# `not self.as_index` is only relevant for DataFrameGroupBy,
@@ -1183,36 +1170,6 @@ def _wrap_aggregated_output(
11831170

11841171
return self._reindex_output(result, qs=qs)
11851172

1186-
@final
1187-
def _wrap_transformed_output(
1188-
self, output: Mapping[base.OutputKey, ArrayLike]
1189-
) -> Series | DataFrame:
1190-
"""
1191-
Wraps the output of GroupBy transformations into the expected result.
1192-
1193-
Parameters
1194-
----------
1195-
output : Mapping[base.OutputKey, ArrayLike]
1196-
Data to wrap.
1197-
1198-
Returns
1199-
-------
1200-
Series or DataFrame
1201-
Series for SeriesGroupBy, DataFrame for DataFrameGroupBy
1202-
"""
1203-
if isinstance(output, (Series, DataFrame)):
1204-
result = output
1205-
else:
1206-
result = self._indexed_output_to_ndframe(output)
1207-
1208-
if self.axis == 1:
1209-
# Only relevant for DataFrameGroupBy
1210-
result = result.T
1211-
result.columns = self.obj.columns
1212-
1213-
result.index = self.obj.index
1214-
return result
1215-
12161173
def _wrap_applied_output(
12171174
self,
12181175
data,
@@ -1456,7 +1413,8 @@ def _python_agg_general(self, func, *args, **kwargs):
14561413
output: dict[base.OutputKey, ArrayLike] = {}
14571414

14581415
if self.ngroups == 0:
1459-
# agg_series below assumes ngroups > 0
1416+
# e.g. test_evaluate_with_empty_groups different path gets different
1417+
# result dtype in empty case.
14601418
return self._python_apply_general(f, self._selected_obj, is_agg=True)
14611419

14621420
for idx, obj in enumerate(self._iterate_slices()):
@@ -1466,9 +1424,11 @@ def _python_agg_general(self, func, *args, **kwargs):
14661424
output[key] = result
14671425

14681426
if not output:
1427+
# e.g. test_groupby_crash_on_nunique, test_margins_no_values_no_cols
14691428
return self._python_apply_general(f, self._selected_obj)
14701429

1471-
return self._wrap_aggregated_output(output)
1430+
res = self._indexed_output_to_ndframe(output)
1431+
return self._wrap_aggregated_output(res)
14721432

14731433
@final
14741434
def _agg_general(
@@ -1837,6 +1797,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
18371797
# If we are grouping on categoricals we want unobserved categories to
18381798
# return zero, rather than the default of NaN which the reindexing in
18391799
# _wrap_agged_manager() returns. GH 35028
1800+
# e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
18401801
with com.temp_setattr(self, "observed", True):
18411802
result = self._wrap_agged_manager(new_mgr)
18421803

@@ -2555,6 +2516,7 @@ def ohlc(self) -> DataFrame:
25552516
)
25562517
return self._reindex_output(result)
25572518

2519+
# TODO: 2023-02-05 all tests that get here have self.as_index
25582520
return self._apply_to_column_groupbys(
25592521
lambda x: x.ohlc(), self._obj_with_exclusions
25602522
)
@@ -2832,7 +2794,13 @@ def blk_func(values: ArrayLike) -> ArrayLike:
28322794
if isinstance(new_obj, Series):
28332795
new_obj.name = obj.name
28342796

2835-
return self._wrap_transformed_output(new_obj)
2797+
if self.axis == 1:
2798+
# Only relevant for DataFrameGroupBy
2799+
new_obj = new_obj.T
2800+
new_obj.columns = self.obj.columns
2801+
2802+
new_obj.index = self.obj.index
2803+
return new_obj
28362804

28372805
@final
28382806
@Substitution(name="groupby")

pandas/core/groupby/ops.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,7 +1028,6 @@ def _aggregate_series_pure_python(
10281028
) -> npt.NDArray[np.object_]:
10291029
ids, _, ngroups = self.group_info
10301030

1031-
counts = np.zeros(ngroups, dtype=int)
10321031
result = np.empty(ngroups, dtype="O")
10331032
initialized = False
10341033

@@ -1044,7 +1043,6 @@ def _aggregate_series_pure_python(
10441043
libreduction.check_result_array(res, group.dtype)
10451044
initialized = True
10461045

1047-
counts[i] = group.shape[0]
10481046
result[i] = res
10491047

10501048
return result

pandas/core/resample.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs):
435435
try:
436436
if isinstance(obj, ABCDataFrame) and callable(how):
437437
# Check if the function is reducing or not.
438+
# e.g. test_resample_apply_with_additional_args
438439
result = grouped._aggregate_item_by_item(how, *args, **kwargs)
439440
else:
440441
result = grouped.aggregate(how, *args, **kwargs)

0 commit comments

Comments (0)