From db3786eecd4eaeff2d02d03ce4f609d1c9bf44a4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 09:14:31 -0800 Subject: [PATCH 1/2] PERF: groupby iteration --- pandas/core/generic.py | 26 ++++++++++++++++++++++++++ pandas/core/groupby/ops.py | 22 +++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e92059c552b65..db53db4b7617e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -469,6 +469,32 @@ def _validate_dtype(cls, dtype) -> DtypeObj | None: return dtype + @property + def _can_fast_construct(self) -> bool_t: + """ + Check if we can avoid _constructor lookup and __finalize__ calls. + """ + # GH#46505 repeated __finalize__ calls caused perf regression, + # see if we can avoid those in some cases. + from pandas import ( + DataFrame, + Series, + ) + + if self.attrs: + # __finalize__ is not a no-op + return False + + if type(self) is Series: + if self._metadata == ["name"]: + return True + return False + elif type(self) is DataFrame: + return not self._metadata + + # subclass, may have overridden _constructor -> cannot fast-construct + return False + # ---------------------------------------------------------------------- # Construction diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 92c36db397f60..ca22d3d8f53b9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1264,8 +1264,13 @@ def __iter__(self) -> Iterator: starts, ends = lib.generate_slices(self._slabels, self.ngroups) + if sdata._can_fast_construct: + chop = self._chop_fast + else: + chop = self._chop + for start, end in zip(starts, ends): - yield self._chop(sdata, slice(start, end)) + yield chop(sdata, slice(start, end)) @cache_readonly def _sorted_data(self) -> NDFrameT: @@ -1274,6 +1279,9 @@ def _sorted_data(self) -> NDFrameT: def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) + def _chop_fast(self, sdata, 
slice_obj: slice) -> NDFrame: + raise AbstractMethodError(self) + class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: @@ -1282,6 +1290,12 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series: ser = sdata._constructor(mgr, name=sdata.name, fastpath=True) return ser.__finalize__(sdata, method="groupby") + def _chop_fast(self, sdata: Series, slice_obj: slice) -> Series: + # _chop specialized to the case where _can_fast_construct is True + mgr = sdata._mgr.get_slice(slice_obj) + ser = Series(mgr, name=sdata.name, fastpath=True) + return ser + class FrameSplitter(DataSplitter): def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: @@ -1294,6 +1308,12 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: df = sdata._constructor(mgr) return df.__finalize__(sdata, method="groupby") + def _chop_fast(self, sdata: Series, slice_obj: slice) -> Series: + # _chop specialized to the case where _can_fast_construct is True + mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) + df = DataFrame(mgr) + return df + def get_splitter( data: NDFrame, labels: np.ndarray, ngroups: int, axis: AxisInt = 0 From 65d7b5a282fd6039363fbff0d8756e2f49476803 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 13:42:44 -0800 Subject: [PATCH 2/2] mypy fixup --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ca22d3d8f53b9..f82a3122b800c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1308,7 +1308,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: df = sdata._constructor(mgr) return df.__finalize__(sdata, method="groupby") - def _chop_fast(self, sdata: Series, slice_obj: slice) -> Series: + def _chop_fast(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # _chop specialized to the case where _can_fast_construct is True mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) df = 
DataFrame(mgr)