From 1b21a6dd734669409886ab3b795c851ba62b4af8 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Sep 2021 10:29:17 -0700 Subject: [PATCH 1/2] REF: dont pass keys through wrap_applied_output --- pandas/core/groupby/generic.py | 23 +++++++++++------------ pandas/core/groupby/groupby.py | 14 +++++++------- pandas/core/groupby/ops.py | 13 +++++++++---- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7af32d70c00bc..4380756be65da 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -399,7 +399,6 @@ def _wrap_transformed_output( def _wrap_applied_output( self, data: Series, - keys: Index, values: list[Any] | None, not_indexed_same: bool = False, ) -> DataFrame | Series: @@ -410,8 +409,6 @@ def _wrap_applied_output( ---------- data : Series Input data for groupby operation. - keys : Index - Keys of groups that Series was grouped by. values : Optional[List[Any]] Applied output for each group. 
not_indexed_same : bool, default False @@ -421,6 +418,8 @@ def _wrap_applied_output( ------- DataFrame or Series """ + keys = self.grouper.group_keys_seq + if len(keys) == 0: # GH #6265 return self.obj._constructor( @@ -442,7 +441,7 @@ def _wrap_applied_output( res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + return self._concat_objects(values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 result = self.obj._constructor( @@ -1130,7 +1129,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: res_df.columns = obj.columns return res_df - def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): + def _wrap_applied_output(self, data, values, not_indexed_same=False): + keys = self.grouper.group_keys_seq + if len(keys) == 0: result = self.obj._constructor( index=self.grouper.result_index, columns=data.columns @@ -1145,7 +1146,7 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): # GH9684 - All values are None, return an empty frame. 
return self.obj._constructor() elif isinstance(first_not_none, DataFrame): - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + return self._concat_objects(values, not_indexed_same=not_indexed_same) key_index = self.grouper.result_index if self.as_index else None @@ -1173,12 +1174,11 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): else: # values are Series return self._wrap_applied_output_series( - keys, values, not_indexed_same, first_not_none, key_index + values, not_indexed_same, first_not_none, key_index ) def _wrap_applied_output_series( self, - keys, values: list[Series], not_indexed_same: bool, first_not_none, @@ -1201,6 +1201,7 @@ def _wrap_applied_output_series( # assign the name to this series if singular_series: + keys = self.grouper.group_keys_seq values[0].name = keys[0] # GH2893 @@ -1209,9 +1210,7 @@ def _wrap_applied_output_series( # if any of the sub-series are not indexed the same # OR we don't have a multi-index and we have only a # single values - return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same - ) + return self._concat_objects(values, not_indexed_same=not_indexed_same) # still a series # path added as of GH 5545 @@ -1222,7 +1221,7 @@ def _wrap_applied_output_series( if not all_indexed_same: # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) + return self._concat_objects(values, not_indexed_same=True) # Combine values # vstack+constructor is faster than concat and handles MI-columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b3e5605d4a2d1..ac88da3f0d47e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -998,7 +998,7 @@ def _iterate_slices(self) -> Iterable[Series]: # Dispatch/Wrapping @final - def _concat_objects(self, keys, values, not_indexed_same: bool = False): + def _concat_objects(self, values, not_indexed_same: bool = False): from 
pandas.core.reshape.concat import concat def reset_identity(values): @@ -1035,7 +1035,7 @@ def reset_identity(values): if self.as_index: # possible MI return case - group_keys = keys + group_keys = self.grouper.group_keys_seq group_levels = self.grouper.levels group_names = self.grouper.names @@ -1146,7 +1146,7 @@ def _wrap_aggregated_output( def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): raise AbstractMethodError(self) - def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output(self, data, values, not_indexed_same: bool = False): raise AbstractMethodError(self) def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: @@ -1182,7 +1182,7 @@ def _group_keys_index(self) -> Index: # The index to use for the result of Groupby Aggregations. # This _may_ be redundant with self.grouper.result_index, but that # has not been conclusively proven yet. - keys = self.grouper._get_group_keys() + keys = self.grouper.group_keys_seq if self.grouper.nkeys > 1: index = MultiIndex.from_tuples(keys, names=self.grouper.names) else: @@ -1223,7 +1223,7 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) data and indices into a Numba jitted function. 
""" starts, ends, sorted_index, sorted_data = self._numba_prep(func, data) - group_keys = self.grouper._get_group_keys() + group_keys = self.grouper.group_keys_seq numba_transform_func = numba_.generate_numba_transform_func( kwargs, func, engine_kwargs @@ -1360,13 +1360,13 @@ def _python_apply_general( Series or DataFrame data after applying f """ - keys, values, mutated = self.grouper.apply(f, data, self.axis) + values, mutated = self.grouper.apply(f, data, self.axis) if not_indexed_same is None: not_indexed_same = mutated or self.mutated return self._wrap_applied_output( - data, keys, values, not_indexed_same=not_indexed_same + data, values, not_indexed_same=not_indexed_same ) @final diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e35f5331195fa..8090758d9cdc3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -690,7 +690,7 @@ def get_iterator( for each group """ splitter = self._get_splitter(data, axis=axis) - keys = self._get_group_keys() + keys = self.group_keys_seq for key, group in zip(keys, splitter): yield key, group.__finalize__(data, method="groupby") @@ -726,10 +726,15 @@ def _get_group_keys(self): return get_flattened_list(ids, ngroups, self.levels, self.codes) @final - def apply(self, f: F, data: FrameOrSeries, axis: int = 0): + @cache_readonly + def group_keys_seq(self): + return self._get_group_keys() + + @final + def apply(self, f: F, data: FrameOrSeries, axis: int = 0) -> tuple[list, bool]: mutated = self.mutated splitter = self._get_splitter(data, axis=axis) - group_keys = self._get_group_keys() + group_keys = self.group_keys_seq result_values = [] # This calls DataSplitter.__iter__ @@ -745,7 +750,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = True result_values.append(res) - return group_keys, result_values, mutated + return result_values, mutated @cache_readonly def indices(self): From 50349651c52ba2389a971b97708389d72baa76fb Mon Sep 17 00:00:00 2001 From: Brock Date: 
Thu, 9 Sep 2021 11:32:51 -0700 Subject: [PATCH 2/2] remove _get_group_keys --- pandas/core/groupby/ops.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8090758d9cdc3..d5569fb5f8a96 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -716,7 +716,8 @@ def _get_grouper(self): return self.groupings[0].grouping_vector @final - def _get_group_keys(self): + @cache_readonly + def group_keys_seq(self): if len(self.groupings) == 1: return self.levels[0] else: @@ -725,11 +726,6 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_list(ids, ngroups, self.levels, self.codes) - @final - @cache_readonly - def group_keys_seq(self): - return self._get_group_keys() - @final def apply(self, f: F, data: FrameOrSeries, axis: int = 0) -> tuple[list, bool]: mutated = self.mutated