From d667aa2f8a8e6ea47bbeafa9e7ca00abc91f9e0c Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 27 Feb 2022 14:34:33 -0800 Subject: [PATCH 1/3] REF: implement _reconstruct_ea_result, _ea_to_cython_values --- pandas/_libs/groupby.pyx | 8 +++++ pandas/core/groupby/ops.py | 63 ++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e9d6b6813c6fd..79457bcd6506f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1350,6 +1350,10 @@ cdef group_min_max(iu_64_floating_t[:, ::1] out, else: if uses_mask: result_mask[i, j] = True + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 else: out[i, j] = nan_val else: @@ -1494,6 +1498,10 @@ cdef group_cummin_max(iu_64_floating_t[:, ::1] out, if not skipna and na_possible and seen_na[lab, j]: if uses_mask: mask[i, j] = 1 # FIXME: shouldn't alter inplace + # Set to 0 ensures that we are deterministic and can + # downcast if appropriate + out[i, j] = 0 + else: out[i, j] = na_val else: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b4fef6fcdff19..2853f7a6cc435 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -78,10 +78,7 @@ Int64Dtype, IntegerDtype, ) -from pandas.core.arrays.masked import ( - BaseMaskedArray, - BaseMaskedDtype, -) +from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -339,6 +336,26 @@ def _ea_wrap_cython_operation( **kwargs, ) + npvalues = self._ea_to_cython_values(values) + + res_values = self._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + + if self.how in ["rank"]: + # i.e. how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + return res_values + + return self._reconstruct_ea_result(values, res_values) + + def _ea_to_cython_values(self, values: ExtensionArray): + # GH#43682 if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents @@ -356,22 +373,7 @@ def _ea_wrap_cython_operation( raise NotImplementedError( f"function is not implemented for this dtype: {values.dtype}" ) - - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=None, - **kwargs, - ) - - if self.how in ["rank"]: - # i.e. how in WrappedCythonOp.cast_blocklist, since - # other cast_blocklist methods dont go through cython_operation - return res_values - - return self._reconstruct_ea_result(values, res_values) + return npvalues def _reconstruct_ea_result(self, values, res_values): """ @@ -387,6 +389,7 @@ def _reconstruct_ea_result(self, values, res_values): return cls._from_sequence(res_values, dtype=dtype) elif needs_i8_conversion(values.dtype): + assert res_values.dtype.kind != "f" # just to be on the safe side i8values = res_values.view("i8") return type(values)(i8values, dtype=values.dtype) @@ -422,14 +425,13 @@ def _masked_ea_wrap_cython_operation( **kwargs, ) - dtype = self._get_result_dtype(orig_values.dtype) - assert isinstance(dtype, BaseMaskedDtype) - cls = dtype.construct_array_type() - if self.kind != "aggregate": - return cls(res_values.astype(dtype.type, copy=False), mask) + res_mask = mask else: - return cls(res_values.astype(dtype.type, copy=False), result_mask) + res_mask = result_mask + # _cython_op_ndim_compat will return res_values with the correct + # dtype, so we do not need to re-call _get_result_dtype and re-cast. + return orig_values._maybe_mask_result(res_values, res_mask) @final def _cython_op_ndim_compat( @@ -577,9 +579,12 @@ def _call_cython_op( cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): - # Note: this conversion could be lossy, see GH#40767 - result = result.astype("float64") - result[empty_groups] = np.nan + if mask is not None and self.uses_mask(): + assert result_mask[empty_groups].all() + else: + # Note: this conversion could be lossy, see GH#40767 + result = result.astype("float64") + result[empty_groups] = np.nan result = result.T From 55f484d8002b66eb30353a0b5f521e5acf1dddd7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 27 Feb 2022 19:49:27 -0800 Subject: [PATCH 2/3] mypy fixup --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2853f7a6cc435..b7cee7b3475f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -579,7 +579,7 @@ def _call_cython_op( cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): - if mask is not None and self.uses_mask(): + if result_mask is not None and self.uses_mask(): assert result_mask[empty_groups].all() else: # Note: this conversion could be lossy, see GH#40767 From f0fc9fd44070a7e8387e643ebe80d82aea617ee9 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 28 Feb 2022 10:02:36 -0800 Subject: [PATCH 3/3] troubleshoot 32bit build --- pandas/core/groupby/ops.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b7cee7b3475f0..8e38d542349d9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -78,7 +78,10 @@ Int64Dtype, IntegerDtype, ) -from pandas.core.arrays.masked import BaseMaskedArray +from pandas.core.arrays.masked import ( + BaseMaskedArray, + BaseMaskedDtype, +) from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -425,13 +428,14 @@ def _masked_ea_wrap_cython_operation( **kwargs, ) + dtype = self._get_result_dtype(orig_values.dtype) + assert isinstance(dtype, BaseMaskedDtype) + cls = dtype.construct_array_type() + if self.kind != "aggregate": - res_mask = mask + return cls(res_values.astype(dtype.type, copy=False), mask) else: - res_mask = result_mask - # _cython_op_ndim_compat will return res_values with the correct - # dtype, so we do not need to re-call _get_result_dtype and re-cast. - return orig_values._maybe_mask_result(res_values, res_mask) + return cls(res_values.astype(dtype.type, copy=False), result_mask) @final def _cython_op_ndim_compat(