From 0c12f1353f291b32831eb97b14c4ed9433da16f3 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 16:41:09 -0700 Subject: [PATCH 1/2] REF: document casting behavior in groupby --- pandas/core/groupby/generic.py | 10 +++------- pandas/core/groupby/groupby.py | 5 ++++- pandas/core/groupby/ops.py | 25 ++++++++++++++++++++----- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9287163053cac..ffadffd8e33e1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -541,9 +541,6 @@ def _transform_general(self, func: Callable, *args, **kwargs) -> Series: object.__setattr__(group, "name", name) res = func(group, *args, **kwargs) - if isinstance(res, (DataFrame, Series)): - res = res._values - results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError @@ -1236,12 +1233,11 @@ def _wrap_applied_output_series( columns = key_index stacked_values = stacked_values.T + if stacked_values.dtype == object: + # We'll have the DataFrame constructor do inference + stacked_values = stacked_values.tolist() result = self.obj._constructor(stacked_values, index=index, columns=columns) - # if we have date/time like in the original, then coerce dates - # as we are stacking can easily have object dtypes here - result = result._convert(datetime=True) - if not self.as_index: self._insert_inaxis_grouper_inplace(result) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1105c1bd1d782..c4d4132932f4f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1331,7 +1331,10 @@ def _agg_py_fallback( # reductions; see GH#28949 ser = df.iloc[:, 0] - res_values = self.grouper.agg_series(ser, alt) + # We do not get here with UDFs, so we know that our dtype + # should always be preserved by the implemented aggregations + # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? + res_values = self.grouper.agg_series(ser, alt, preserve=True) if isinstance(values, Categorical): # Because we only get here with known dtype-preserving diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 46b47bc29d8a6..fe737e2b27aea 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -967,11 +967,22 @@ def _cython_operation( ) @final - def agg_series(self, obj: Series, func: F) -> ArrayLike: + def agg_series(self, obj: Series, func: F, preserve: bool = False) -> ArrayLike: + """ + Parameters + ---------- + obj : Series + func : function taking a Series and returning a scalar-like + preserve : bool + Whether the aggregation is known to be dtype-preserving. + + Returns + ------- + np.ndarray or ExtensionArray + """ # test_groupby_empty_with_category gets here with self.ngroups == 0 # and len(obj) > 0 - cast_back = True if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast result = self._aggregate_series_pure_python(obj, func) @@ -983,17 +994,21 @@ def agg_series(self, obj: Series, func: F) -> ArrayLike: # TODO: can we get a performant workaround for EAs backed by ndarray? result = self._aggregate_series_pure_python(obj, func) + # we can preserve a little bit more aggressively with EA dtype + # because maybe_cast_pointwise_result will do a try/except + # with _from_sequence. NB we are assuming here that _from_sequence + # is sufficiently strict that it casts appropriately. + preserve = True + elif obj.index._has_complex_internals: # Preempt TypeError in _aggregate_series_fast result = self._aggregate_series_pure_python(obj, func) else: result = self._aggregate_series_fast(obj, func) - cast_back = False npvalues = lib.maybe_convert_objects(result, try_float=False) - if cast_back: - # TODO: Is there a documented reason why we dont always cast_back? + if preserve: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues From a347b579ae8682b46303ac14c78aa90963303f52 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 08:56:53 -0700 Subject: [PATCH 2/2] preserve -> preserve_dtype --- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7acd4c96bdf45..0b07668a9fea2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1332,7 +1332,7 @@ def _agg_py_fallback( # We do not get here with UDFs, so we know that our dtype # should always be preserved by the implemented aggregations # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? - res_values = self.grouper.agg_series(ser, alt, preserve=True) + res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) if isinstance(values, Categorical): # Because we only get here with known dtype-preserving diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 279c8759ec6da..8b6136b3abc42 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -966,13 +966,15 @@ def _cython_operation( ) @final - def agg_series(self, obj: Series, func: F, preserve: bool = False) -> ArrayLike: + def agg_series( + self, obj: Series, func: F, preserve_dtype: bool = False + ) -> ArrayLike: """ Parameters ---------- obj : Series func : function taking a Series and returning a scalar-like - preserve : bool + preserve_dtype : bool Whether the aggregation is known to be dtype-preserving. Returns @@ -997,7 +999,7 @@ def agg_series(self, obj: Series, func: F, preserve: bool = False) -> ArrayLike: # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence # is sufficiently strict that it casts appropriately. - preserve = True + preserve_dtype = True elif obj.index._has_complex_internals: # Preempt TypeError in _aggregate_series_fast @@ -1007,7 +1009,7 @@ def agg_series(self, obj: Series, func: F, preserve: bool = False) -> ArrayLike: result = self._aggregate_series_fast(obj, func) npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve: + if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues