From f8acf9a6a548dfd8d90b944cade338c7f83648b8 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 May 2021 14:46:10 -0700 Subject: [PATCH 1/3] REF: consolidate casting in groupby agg_series --- pandas/_libs/reduction.pyx | 8 +------- pandas/core/groupby/generic.py | 3 +-- pandas/core/groupby/ops.py | 18 ++++++++++-------- pandas/core/series.py | 2 +- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 191967585c431..1e56cdbb614ad 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -21,10 +21,7 @@ from pandas._libs.util cimport ( set_array_not_contiguous, ) -from pandas._libs.lib import ( - is_scalar, - maybe_convert_objects, -) +from pandas._libs.lib import is_scalar cpdef check_result_array(object obj): @@ -183,7 +180,6 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.reset() vslider.reset() - result = maybe_convert_objects(result) return result, counts @@ -284,8 +280,6 @@ cdef class SeriesGrouper(_BaseGrouper): # have result initialized by this point. assert initialized, "`result` has not been initialized." - result = maybe_convert_objects(result) - return result, counts diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b9f1ca0710872..ba7c409be222d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -364,8 +364,7 @@ def _cython_agg_general( # without special-casing, we would raise, then in fallback # would eventually call agg_series but without re-casting # to Categorical - # equiv: res_values, _ = self.grouper.agg_series(obj, alt) - res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt) + res_values, _ = self.grouper.agg_series(obj, alt) else: # equiv: res_values = self._python_agg_general(alt) res_values = self._python_apply_general(alt, self._selected_obj) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 9edbeb412026d..d4b6555515054 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -972,20 +972,25 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]: if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast - return self._aggregate_series_pure_python(obj, func) + result, counts = self._aggregate_series_pure_python(obj, func) elif is_extension_array_dtype(obj.dtype): # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider # In the datetime64tz case it would incorrectly cast to tz-naive # TODO: can we get a performant workaround for EAs backed by ndarray? - return self._aggregate_series_pure_python(obj, func) + result, counts = self._aggregate_series_pure_python(obj, func) elif obj.index._has_complex_internals: # Preempt TypeError in _aggregate_series_fast - return self._aggregate_series_pure_python(obj, func) + result, counts = self._aggregate_series_pure_python(obj, func) - return self._aggregate_series_fast(obj, func) + else: + result, counts = self._aggregate_series_fast(obj, func) + + npvalues = lib.maybe_convert_objects(result, try_float=False) + out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + return out, counts def _aggregate_series_fast( self, obj: Series, func: F @@ -1033,10 +1038,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): counts[i] = group.shape[0] result[i] = res - npvalues = lib.maybe_convert_objects(result, try_float=False) - out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) - - return out, counts + return result, counts class BinGrouper(BaseGrouper): diff --git a/pandas/core/series.py b/pandas/core/series.py index 6a3e997a58754..f3f4e49b810bd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3086,7 +3086,7 @@ def combine(self, other, func, fill_value=None) -> Series: new_values[:] = [func(lv, other) for lv in self._values] new_name = self.name - # try_float=False is to match _aggregate_series_pure_python + # try_float=False is to match agg_series npvalues = lib.maybe_convert_objects(new_values, try_float=False) res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False) return self._constructor(res_values, index=new_index, name=new_name) From 5cde819b0403ee106c29c7b0d747a24486067f82 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 May 2021 15:41:50 -0700 Subject: [PATCH 2/3] update tests --- pandas/core/groupby/generic.py | 7 ++++++- pandas/tests/groupby/test_bin_groupby.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ba7c409be222d..0fbc146c4cfb0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -367,7 +367,12 @@ def _cython_agg_general( res_values, _ = self.grouper.agg_series(obj, alt) else: # equiv: res_values = self._python_agg_general(alt) - res_values = self._python_apply_general(alt, self._selected_obj) + # error: Incompatible types in assignment (expression has + # type "Union[DataFrame, Series]", variable has type + # "Union[ExtensionArray, ndarray]") + res_values = self._python_apply_general( # type: ignore[assignment] + alt, self._selected_obj + ) result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 13fddad30eeba..aa126ae801f1e 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -20,7 +20,7 @@ def test_series_grouper(): grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2) result, counts = grouper.get_result() - expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + expected = np.array([obj[3:6].mean(), obj[6:].mean()], dtype=object) tm.assert_almost_equal(result, expected) exp_counts = np.array([3, 4], dtype=np.int64) @@ -36,7 +36,7 @@ def test_series_grouper_result_length_difference(): grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2) result, counts = grouper.get_result() - expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)]) + expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object) tm.assert_equal(result, expected) exp_counts = np.array([3, 4], dtype=np.int64) @@ -61,7 +61,7 @@ def test_series_bin_grouper(): grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins) result, counts = grouper.get_result() - expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()], dtype=object) tm.assert_almost_equal(result, expected) exp_counts = np.array([3, 3, 4], dtype=np.int64) From e885aa4de92c1af0463f085e4110148251d8d429 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 May 2021 19:43:43 -0700 Subject: [PATCH 3/3] conditional cast_back --- pandas/core/groupby/ops.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d4b6555515054..30be9968ad91f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -970,6 +970,7 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]: # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 + cast_back = True if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast result, counts = self._aggregate_series_pure_python(obj, func) @@ -987,9 +988,14 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]: else: result, counts = self._aggregate_series_fast(obj, func) + cast_back = False npvalues = lib.maybe_convert_objects(result, try_float=False) - out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + if cast_back: + # TODO: Is there a documented reason why we dont always cast_back? + out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + else: + out = npvalues return out, counts def _aggregate_series_fast(