From d667aa2f8a8e6ea47bbeafa9e7ca00abc91f9e0c Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 27 Feb 2022 14:34:33 -0800
Subject: [PATCH 1/3] REF: implement _reconstruct_ea_result,
 _ea_to_cython_values

---
 pandas/_libs/groupby.pyx   |  8 +++++
 pandas/core/groupby/ops.py | 63 ++++++++++++++++++++------------------
 2 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index e9d6b6813c6fd..79457bcd6506f 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1350,6 +1350,10 @@ cdef group_min_max(iu_64_floating_t[:, ::1] out,
                     else:
                         if uses_mask:
                             result_mask[i, j] = True
+                            # set out[i, j] to 0 to be deterministic, as
+                            #  it was initialized with np.empty. Also ensures
+                            #  we can downcast out if appropriate.
+                            out[i, j] = 0
                         else:
                             out[i, j] = nan_val
                 else:
@@ -1494,6 +1498,10 @@ cdef group_cummin_max(iu_64_floating_t[:, ::1] out,
                 if not skipna and na_possible and seen_na[lab, j]:
                     if uses_mask:
                         mask[i, j] = 1   # FIXME: shouldn't alter inplace
+                        # Set to 0 ensures that we are deterministic and can
+                        #  downcast if appropriate
+                        out[i, j] = 0
+
                     else:
                         out[i, j] = na_val
                 else:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b4fef6fcdff19..2853f7a6cc435 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -78,10 +78,7 @@
     Int64Dtype,
     IntegerDtype,
 )
-from pandas.core.arrays.masked import (
-    BaseMaskedArray,
-    BaseMaskedDtype,
-)
+from pandas.core.arrays.masked import BaseMaskedArray
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
@@ -339,6 +336,26 @@ def _ea_wrap_cython_operation(
                 **kwargs,
             )
 
+        npvalues = self._ea_to_cython_values(values)
+
+        res_values = self._cython_op_ndim_compat(
+            npvalues,
+            min_count=min_count,
+            ngroups=ngroups,
+            comp_ids=comp_ids,
+            mask=None,
+            **kwargs,
+        )
+
+        if self.how in ["rank"]:
+            # i.e. how in WrappedCythonOp.cast_blocklist, since
+            #  other cast_blocklist methods dont go through cython_operation
+            return res_values
+
+        return self._reconstruct_ea_result(values, res_values)
+
+    def _ea_to_cython_values(self, values: ExtensionArray):
+        # GH#43682
         if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)):
             # All of the functions implemented here are ordinal, so we can
             #  operate on the tz-naive equivalents
@@ -356,22 +373,7 @@ def _ea_wrap_cython_operation(
             raise NotImplementedError(
                 f"function is not implemented for this dtype: {values.dtype}"
             )
-
-        res_values = self._cython_op_ndim_compat(
-            npvalues,
-            min_count=min_count,
-            ngroups=ngroups,
-            comp_ids=comp_ids,
-            mask=None,
-            **kwargs,
-        )
-
-        if self.how in ["rank"]:
-            # i.e. how in WrappedCythonOp.cast_blocklist, since
-            #  other cast_blocklist methods dont go through cython_operation
-            return res_values
-
-        return self._reconstruct_ea_result(values, res_values)
+        return npvalues
 
     def _reconstruct_ea_result(self, values, res_values):
         """
@@ -387,6 +389,7 @@ def _reconstruct_ea_result(self, values, res_values):
             return cls._from_sequence(res_values, dtype=dtype)
 
         elif needs_i8_conversion(values.dtype):
+            assert res_values.dtype.kind != "f"  # just to be on the safe side
             i8values = res_values.view("i8")
             return type(values)(i8values, dtype=values.dtype)
 
@@ -422,14 +425,13 @@ def _masked_ea_wrap_cython_operation(
             **kwargs,
         )
 
-        dtype = self._get_result_dtype(orig_values.dtype)
-        assert isinstance(dtype, BaseMaskedDtype)
-        cls = dtype.construct_array_type()
-
         if self.kind != "aggregate":
-            return cls(res_values.astype(dtype.type, copy=False), mask)
+            res_mask = mask
         else:
-            return cls(res_values.astype(dtype.type, copy=False), result_mask)
+            res_mask = result_mask
+        # _cython_op_ndim_compat will return res_values with the correct
+        #  dtype, so we do not need to re-call _get_result_dtype and re-cast.
+        return orig_values._maybe_mask_result(res_values, res_mask)
 
     @final
     def _cython_op_ndim_compat(
@@ -577,9 +579,12 @@ def _call_cython_op(
                 cutoff = max(1, min_count)
                 empty_groups = counts < cutoff
                 if empty_groups.any():
-                    # Note: this conversion could be lossy, see GH#40767
-                    result = result.astype("float64")
-                    result[empty_groups] = np.nan
+                    if mask is not None and self.uses_mask():
+                        assert result_mask[empty_groups].all()
+                    else:
+                        # Note: this conversion could be lossy, see GH#40767
+                        result = result.astype("float64")
+                        result[empty_groups] = np.nan
 
         result = result.T
 

From 55f484d8002b66eb30353a0b5f521e5acf1dddd7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Sun, 27 Feb 2022 19:49:27 -0800
Subject: [PATCH 2/3] mypy fixup

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 2853f7a6cc435..b7cee7b3475f0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -579,7 +579,7 @@ def _call_cython_op(
                 cutoff = max(1, min_count)
                 empty_groups = counts < cutoff
                 if empty_groups.any():
-                    if mask is not None and self.uses_mask():
+                    if result_mask is not None and self.uses_mask():
                         assert result_mask[empty_groups].all()
                     else:
                         # Note: this conversion could be lossy, see GH#40767

From f0fc9fd44070a7e8387e643ebe80d82aea617ee9 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 28 Feb 2022 10:02:36 -0800
Subject: [PATCH 3/3] troubleshoot 32bit build

---
 pandas/core/groupby/ops.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b7cee7b3475f0..8e38d542349d9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -78,7 +78,10 @@
     Int64Dtype,
     IntegerDtype,
 )
-from pandas.core.arrays.masked import BaseMaskedArray
+from pandas.core.arrays.masked import (
+    BaseMaskedArray,
+    BaseMaskedDtype,
+)
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
@@ -425,13 +428,14 @@ def _masked_ea_wrap_cython_operation(
             **kwargs,
         )
 
+        dtype = self._get_result_dtype(orig_values.dtype)
+        assert isinstance(dtype, BaseMaskedDtype)
+        cls = dtype.construct_array_type()
+
         if self.kind != "aggregate":
-            res_mask = mask
+            return cls(res_values.astype(dtype.type, copy=False), mask)
         else:
-            res_mask = result_mask
-        # _cython_op_ndim_compat will return res_values with the correct
-        #  dtype, so we do not need to re-call _get_result_dtype and re-cast.
-        return orig_values._maybe_mask_result(res_values, res_mask)
+            return cls(res_values.astype(dtype.type, copy=False), result_mask)
 
     @final
     def _cython_op_ndim_compat(