From ded8f72c518c7321b92fe5bffb5ba53eb0aafd41 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 30 Apr 2021 14:36:03 -0700
Subject: [PATCH 1/3] REF: simplify cython_agg_general

---
 pandas/core/groupby/generic.py | 56 ++++++++++++++++++--------------------
 pandas/core/groupby/groupby.py |  4 +--
 2 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index ab03cce0d6476..65e81adb9328f 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -345,47 +345,37 @@ def _aggregate_multiple_funcs(self, arg):
     def _cython_agg_general(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
     ):
-        output: dict[base.OutputKey, ArrayLike] = {}
-        # Ideally we would be able to enumerate self._iterate_slices and use
-        # the index from enumeration as the key of output, but ohlc in particular
-        # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
-        # need to slice that up into 1D arrays
-        idx = 0
-        for obj in self._iterate_slices():
-            name = obj.name
-            is_numeric = is_numeric_dtype(obj.dtype)
-            if numeric_only and not is_numeric:
-                continue
 
-            objvals = obj._values
+        obj = self._selected_obj
 
-            if isinstance(objvals, Categorical):
-                if self.grouper.ngroups > 0:
-                    # without special-casing, we would raise, then in fallback
-                    # would eventually call agg_series but without re-casting
-                    # to Categorical
-                    # equiv: res_values, _ = self.grouper.agg_series(obj, alt)
-                    res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt)
-                else:
-                    # equiv: res_values = self._python_agg_general(alt)
-                    res_values = self._python_apply_general(alt, self._selected_obj)
+        is_numeric = is_numeric_dtype(obj.dtype)
+        if numeric_only and not is_numeric:
+            raise DataError("No numeric types to aggregate")
 
-                result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype)
+        objvals = obj._values
 
+        if isinstance(objvals, Categorical):
+            if self.grouper.ngroups > 0:
+                # without special-casing, we would raise, then in fallback
+                # would eventually call agg_series but without re-casting
+                # to Categorical
+                # equiv: res_values, _ = self.grouper.agg_series(obj, alt)
+                res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt)
             else:
-                result = self.grouper._cython_operation(
-                    "aggregate", obj._values, how, axis=0, min_count=min_count
-                )
+                # equiv: res_values = self._python_agg_general(alt)
+                res_values = self._python_apply_general(alt, self._selected_obj)
 
-            assert result.ndim == 1
-            key = base.OutputKey(label=name, position=idx)
-            output[key] = result
-            idx += 1
+            result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype)
 
-        if not output:
-            raise DataError("No numeric types to aggregate")
+        else:
+            result = self.grouper._cython_operation(
+                "aggregate", obj._values, how, axis=0, min_count=min_count
+            )
 
-        return self._wrap_aggregated_output(output)
+        ser = self.obj._constructor(
+            result, index=self.grouper.result_index, name=obj.name
+        )
+        return self._reindex_output(ser)
 
     def _wrap_aggregated_output(
         self,
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 7fe9d7cb49eb5..29e76d8231562 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1282,9 +1282,7 @@ def _agg_general(
             except DataError:
                 pass
             except NotImplementedError as err:
-                if "function is not implemented for this dtype" in str(
-                    err
-                ) or "category dtype not supported" in str(err):
+                if "function is not implemented for this dtype" in str(err):
                     # raised in _get_cython_function, in some cases can
                     # be trimmed by implementing cython funcs for more dtypes
                     pass
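[Illustration, not part of the patch series] Patch 1 above makes SeriesGroupBy._cython_agg_general work directly on the single selected series and wrap the cython result in a Series indexed by self.grouper.result_index, while keeping the Categorical special case that re-casts the pure-Python fallback result to the original dtype. A minimal public-API sketch of the behaviour that special case preserves; the example data and the printed values are assumptions, not taken from the patch:

import pandas as pd

# Ordered Categorical values cannot go through the cython aggregation kernels,
# so SeriesGroupBy falls back to the pure-Python path and re-casts the result.
ser = pd.Series(
    pd.Categorical(["a", "b", "a", "c"], categories=["a", "b", "c"], ordered=True),
    name="vals",
)
key = [1, 1, 2, 2]

result = ser.groupby(key).min()
# With the re-cast described in the patch comment, the result is expected to
# keep the ordered categorical dtype and be indexed by the group labels 1 and 2.
print(result.dtype)
print(result.index.tolist())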
From 9b9ae13efc37a7f64d0c1eed2ede38093847ae40 Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 30 Apr 2021 14:59:47 -0700
Subject: [PATCH 2/3] remove unnecessary consolidate

---
 pandas/core/groupby/groupby.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 29e76d8231562..8640298ac39cd 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1815,9 +1815,7 @@ def describe(self, **kwargs):
         result = self.apply(lambda x: x.describe(**kwargs))
         if self.axis == 1:
             return result.T
-        # FIXME: not being consolidated breaks
-        # test_describe_with_duplicate_output_column_names
-        return result._consolidate().unstack()
+        return result.unstack()
 
     @final
     def resample(self, rule, *args, **kwargs):
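[Illustration, not part of the patch series] Patch 2 above drops the _consolidate() workaround from GroupBy.describe, which builds the per-group statistics via apply and then unstacks them into columns. A rough public-API sketch of that apply/unstack shape, using made-up data:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})

# Same shape of operation as the patched method body:
# result = self.apply(lambda x: x.describe(**kwargs)); return result.unstack()
stacked = df.groupby("key")[["val"]].apply(lambda x: x.describe())
wide = stacked.unstack()

# The describe() statistics become the inner column level, one row per group.
print(wide.index.tolist())   # ['a', 'b']
print(wide.columns.nlevels)  # 2: (original column, statistic)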
From c2ba234ce06896032407ad2776a9a2e411b4d97c Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 2 May 2021 13:02:04 -0700
Subject: [PATCH 3/3] revert everything but describe

---
 pandas/core/groupby/generic.py        | 56 ++++++++++++++++++-----------
 pandas/core/groupby/groupby.py        |  4 +-
 pandas/tests/groupby/test_function.py |  1 +
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index e716e05850dbd..b9f1ca0710872 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -345,37 +345,47 @@ def _aggregate_multiple_funcs(self, arg):
     def _cython_agg_general(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
     ):
+        output: dict[base.OutputKey, ArrayLike] = {}
+        # Ideally we would be able to enumerate self._iterate_slices and use
+        # the index from enumeration as the key of output, but ohlc in particular
+        # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
+        # need to slice that up into 1D arrays
+        idx = 0
+        for obj in self._iterate_slices():
+            name = obj.name
+            is_numeric = is_numeric_dtype(obj.dtype)
+            if numeric_only and not is_numeric:
+                continue
 
-        obj = self._selected_obj
+            objvals = obj._values
 
-        is_numeric = is_numeric_dtype(obj.dtype)
-        if numeric_only and not is_numeric:
-            raise DataError("No numeric types to aggregate")
+            if isinstance(objvals, Categorical):
+                if self.grouper.ngroups > 0:
+                    # without special-casing, we would raise, then in fallback
+                    # would eventually call agg_series but without re-casting
+                    # to Categorical
+                    # equiv: res_values, _ = self.grouper.agg_series(obj, alt)
+                    res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt)
 
-        objvals = obj._values
+                else:
+                    # equiv: res_values = self._python_agg_general(alt)
+                    res_values = self._python_apply_general(alt, self._selected_obj)
 
-        if isinstance(objvals, Categorical):
-            if self.grouper.ngroups > 0:
-                # without special-casing, we would raise, then in fallback
-                # would eventually call agg_series but without re-casting
-                # to Categorical
-                # equiv: res_values, _ = self.grouper.agg_series(obj, alt)
-                res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt)
+                result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype)
             else:
-                # equiv: res_values = self._python_agg_general(alt)
-                res_values = self._python_apply_general(alt, self._selected_obj)
+                result = self.grouper._cython_operation(
+                    "aggregate", obj._values, how, axis=0, min_count=min_count
+                )
 
-            result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype)
+            assert result.ndim == 1
+            key = base.OutputKey(label=name, position=idx)
+            output[key] = result
+            idx += 1
 
-        else:
-            result = self.grouper._cython_operation(
-                "aggregate", obj._values, how, axis=0, min_count=min_count
-            )
+        if not output:
+            raise DataError("No numeric types to aggregate")
 
-        ser = self.obj._constructor(
-            result, index=self.grouper.result_index, name=obj.name
-        )
-        return self._reindex_output(ser)
+        return self._wrap_aggregated_output(output)
 
     def _wrap_aggregated_output(
         self,
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 8640298ac39cd..36c7f53d23098 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1282,7 +1282,9 @@ def _agg_general(
             except DataError:
                 pass
             except NotImplementedError as err:
-                if "function is not implemented for this dtype" in str(err):
+                if "function is not implemented for this dtype" in str(
+                    err
+                ) or "category dtype not supported" in str(err):
                     # raised in _get_cython_function, in some cases can
                     # be trimmed by implementing cython funcs for more dtypes
                     pass
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 40f8135637292..163303168c240 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1077,6 +1077,7 @@ def test_describe_with_duplicate_output_column_names(as_index):
             "c": [10, 20, 30, 40, 50, 60],
         },
         columns=["a", "b", "b"],
+        copy=False,
    )
 
    expected = (
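[Illustration, not part of the patch series; the test's expected frame is cut off above] Patch 3 adds copy=False to the test's DataFrame constructor. The test builds a frame whose columns= relabelling yields a duplicate column name, which is exactly the case the un-consolidated describe() path from patch 2 has to keep handling. A sketch with made-up values (the real test uses longer columns and an as_index fixture):

import pandas as pd

# Selecting "b" twice via ``columns=`` gives duplicate column names; "c" is
# dropped because it is not in the requested columns.
df = pd.DataFrame(
    {"a": [99, 99, 88], "b": [1, 2, 3], "c": [10, 20, 30]},
    columns=["a", "b", "b"],
    copy=False,
)
print(df.columns.tolist())  # ['a', 'b', 'b']

# This is the frame shape that GroupBy.describe (patch 2) must handle without
# the removed _consolidate() call.
print(df.groupby("a").describe().shape)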