From 4cebd254207748fa21c92a91a7e37d9597d41036 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 12:00:45 -0700 Subject: [PATCH 1/3] CLN: use RangeIndex for groupby defaults --- pandas/core/groupby/generic.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9287163053cac..fc3e93546595f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -90,7 +90,6 @@ MultiIndex, all_indexes_same, ) -import pandas.core.indexes.base as ibase from pandas.core.series import Series from pandas.core.util.numba_ import maybe_use_numba @@ -1611,8 +1610,8 @@ def _wrap_transformed_output( def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: if not self.as_index: - index = np.arange(mgr.shape[1]) - mgr.set_axis(1, ibase.Index(index)) + index = Index(range(mgr.shape[1])) + mgr.set_axis(1, index) result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) @@ -1761,7 +1760,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: results._get_axis(other_axis).names = obj._get_axis(other_axis).names if not self.as_index: - results.index = ibase.default_index(len(results)) + results.index = Index(range(len(results))) self._insert_inaxis_grouper_inplace(results) return results From ee63f4447d6d465bc50017b84382140eaa8f1c51 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 18:59:57 -0700 Subject: [PATCH 2/3] early-return, better names, RangeIndex --- pandas/core/groupby/generic.py | 19 +++++++++++-------- pandas/core/groupby/ops.py | 6 +++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fc3e93546595f..e4227af742f7c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -481,14 +481,13 @@ def _get_index() -> Index: if isinstance(values[0], dict): # GH #823 #24880 index = _get_index() - result: FrameOrSeriesUnion = self._reindex_output( - self.obj._constructor_expanddim(values, index=index) - ) + res_df = self.obj._constructor_expanddim(values, index=index) + res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing - result = result.stack(dropna=self.observed) - result.name = self._selection_name - return result + res_ser = res_df.stack(dropna=self.observed) + res_ser.name = self._selection_name + return res_ser elif isinstance(values[0], (Series, DataFrame)): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: @@ -1005,7 +1004,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) elif self.axis == 1: # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ result = self._aggregate_frame(func) + return result else: @@ -1035,7 +1036,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if not self.as_index: self._insert_inaxis_grouper_inplace(result) - result.index = np.arange(len(result)) + result.index = Index(range(len(result))) return result._convert(datetime=True) @@ -1161,7 +1162,9 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): if self.as_index: return self.obj._constructor_sliced(values, index=key_index) else: - result = DataFrame(values, index=key_index, columns=[self._selection]) + result = self.obj._constructor( + values, index=key_index, columns=[self._selection] + ) self._insert_inaxis_grouper_inplace(result) return result else: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 46b47bc29d8a6..3045451974ee7 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -889,9 +889,8 @@ def codes_info(self) -> np.ndarray: @final def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: - all_codes = self.codes - if len(all_codes) > 1: - group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) + if len(self.groupings) > 1: + group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] @@ -1111,6 +1110,7 @@ def groups(self): @property def nkeys(self) -> int: + # still matches len(self.groupings), but we can hard-code return 1 def _get_grouper(self): From 40093ac13e397a90d01691375ff0c60abde3195a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 22:38:16 -0700 Subject: [PATCH 3/3] comments, test --- pandas/core/groupby/generic.py | 3 +++ pandas/tests/groupby/test_groupby.py | 23 ++++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e4227af742f7c..df0a413b7a76a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -998,8 +998,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # grouper specific aggregations if self.grouper.nkeys > 1: + # test_groupby_as_index_series_scalar gets here with 'not self.as_index' return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early result = self._aggregate_frame(func, *args, **kwargs) elif self.axis == 1: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f716a3a44cd54..44d48c45e1fd1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -234,17 +234,18 @@ def f(x, q=None, axis=0): tm.assert_series_equal(trans_result, trans_expected) # DataFrame - df_grouped = tsframe.groupby(lambda x: x.month) - agg_result = df_grouped.agg(np.percentile, 80, axis=0) - apply_result = df_grouped.apply(DataFrame.quantile, 0.8) - expected = df_grouped.quantile(0.8) - tm.assert_frame_equal(apply_result, expected, check_names=False) - tm.assert_frame_equal(agg_result, expected) - - agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) - tm.assert_frame_equal(agg_result, expected) - tm.assert_frame_equal(apply_result, expected, check_names=False) + for as_index in [True, False]: + df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) + tm.assert_frame_equal(apply_result, expected, check_names=False) + tm.assert_frame_equal(agg_result, expected) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + tm.assert_frame_equal(agg_result, expected) + tm.assert_frame_equal(apply_result, expected, check_names=False) def test_len():