From 74fe8697c09a9e3b1d4811e5bba3971d4851ab72 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Feb 2023 12:50:40 -0800 Subject: [PATCH 1/3] REF: simplify _apply_to_column_groupbys, rename apply->apply_groupwise --- pandas/core/groupby/generic.py | 24 ++++++++++-------------- pandas/core/groupby/groupby.py | 6 ++---- pandas/core/groupby/ops.py | 2 +- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 19fba398feb08..59cd76b6629e3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1850,23 +1850,22 @@ def _indexed_output_to_ndframe( def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: return self.obj._constructor(mgr) - def _iterate_column_groupbys(self, obj: DataFrame): - for i, colname in enumerate(obj.columns): - yield colname, SeriesGroupBy( + def _apply_to_column_groupbys(self, func) -> DataFrame: + from pandas.core.reshape.concat import concat + + obj = self._obj_with_exclusions + columns = obj.columns + sgbs = [ + SeriesGroupBy( obj.iloc[:, i], selection=colname, grouper=self.grouper, exclusions=self.exclusions, observed=self.observed, ) - - def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame: - from pandas.core.reshape.concat import concat - - columns = obj.columns - results = [ - func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj) + for i, colname in enumerate(obj.columns) ] + results = [func(sgb) for sgb in sgbs] if not len(results): # concat would raise @@ -1925,10 +1924,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True ) - obj = self._obj_with_exclusions - results = self._apply_to_column_groupbys( - lambda sgb: sgb.nunique(dropna), obj=obj - ) + results = self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) if not self.as_index: results.index = default_index(len(results)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dee68c01587b1..8431aa820d296 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1398,7 +1398,7 @@ def _python_apply_general( Series or DataFrame data after applying f """ - values, mutated = self.grouper.apply(f, data, self.axis) + values, mutated = self.grouper.apply_groupwise(f, data, self.axis) if not_indexed_same is None: not_indexed_same = mutated @@ -2476,9 +2476,7 @@ def ohlc(self) -> DataFrame: ) return self._reindex_output(result) - result = self._apply_to_column_groupbys( - lambda x: x.ohlc(), self._obj_with_exclusions - ) + result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 52b8301554c96..726d75d705344 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -748,7 +748,7 @@ def group_keys_seq(self): return get_flattened_list(ids, ngroups, self.levels, self.codes) @final - def apply( + def apply_groupwise( self, f: Callable, data: DataFrame | Series, axis: AxisInt = 0 ) -> tuple[list, bool]: mutated = False From 755876852a5e50d4b1178d0370d33cd987db201e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Feb 2023 18:11:02 -0800 Subject: [PATCH 2/3] REF: consolidate self.as_index check --- pandas/core/groupby/generic.py | 17 ++++++++--------- pandas/core/groupby/groupby.py | 3 --- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 59cd76b6629e3..1d620564b4e49 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1869,9 +1869,14 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: if not len(results): # concat would raise - return DataFrame([], columns=columns, index=self.grouper.result_index) + res_df = DataFrame([], columns=columns, index=self.grouper.result_index) else: - return concat(results, keys=columns, axis=1) + res_df = concat(results, keys=columns, axis=1) + + if not self.as_index: + res_df.index = default_index(len(res_df)) + res_df = self._insert_inaxis_grouper(res_df) + return res_df def nunique(self, dropna: bool = True) -> DataFrame: """ @@ -1924,13 +1929,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True ) - results = self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) - - if not self.as_index: - results.index = default_index(len(results)) - results = self._insert_inaxis_grouper(results) - - return results + return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) def idxmax( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8431aa820d296..c199290b8caec 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2477,9 +2477,6 @@ def ohlc(self) -> DataFrame: return self._reindex_output(result) result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) return result @doc(DataFrame.describe) From 9baf68ffae410c6df99855c0b5cd24d28fd8b7ef Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 20 Feb 2023 13:39:29 -0800 Subject: [PATCH 3/3] REF: remove unused _iterate_slices --- pandas/core/groupby/generic.py | 13 ++++++++----- pandas/core/groupby/groupby.py | 8 -------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1d620564b4e49..67188d91bca70 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -154,9 +154,6 @@ def _get_data_to_aggregate( ) return single - def _iterate_slices(self) -> Iterable[Series]: - yield self._selected_obj - _agg_examples_doc = dedent( """ Examples @@ -408,7 +405,9 @@ def _aggregate_named(self, func, *args, **kwargs): result = {} initialized = False - for name, group in self: + for name, group in self.grouper.get_iterator( + self._selected_obj, axis=self.axis + ): object.__setattr__(group, "name", name) output = func(group, *args, **kwargs) @@ -568,7 +567,11 @@ def true_and_notna(x) -> bool: try: indices = [ - self._get_index(name) for name, group in self if true_and_notna(group) + self._get_index(name) + for name, group in self.grouper.get_iterator( + self._selected_obj, axis=self.axis + ) + if true_and_notna(group) ] except (ValueError, TypeError) as err: raise TypeError("the filter must return a boolean result") from err diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c199290b8caec..43354fc589760 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -19,7 +19,6 @@ class providing the base-class of operations. TYPE_CHECKING, Callable, Hashable, - Iterable, Iterator, List, Literal, @@ -990,12 +989,6 @@ def curried(x): result = self._set_result_index_ordered(result) return result - # ----------------------------------------------------------------- - # Selection - - def _iterate_slices(self) -> Iterable[Series]: - raise AbstractMethodError(self) - # ----------------------------------------------------------------- # Dispatch/Wrapping @@ -2459,7 +2452,6 @@ def ohlc(self) -> DataFrame: Open, high, low and close values within each group. """ if self.obj.ndim == 1: - # self._iterate_slices() yields only self._selected_obj obj = self._selected_obj is_numeric = is_numeric_dtype(obj.dtype)