From 6c2f679890361681d5e15155c0c0f167d4dd806a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Mar 2022 16:31:10 -0800 Subject: [PATCH 1/2] BUG: GroupBy.cummin altering nullable arrays inplace --- pandas/_libs/groupby.pyx | 48 ++++++++++++++++----------- pandas/core/groupby/ops.py | 18 +++++----- pandas/tests/groupby/test_function.py | 4 +++ 3 files changed, 41 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d250a69c1985b..d4d84a4d12860 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1459,7 +1459,8 @@ def group_min( cdef group_cummin_max( iu_64_floating_t[:, ::1] out, ndarray[iu_64_floating_t, ndim=2] values, - uint8_t[:, ::1] mask, + const uint8_t[:, ::1] mask, + uint8_t[:, ::1] result_mask, const intp_t[::1] labels, int ngroups, bint is_datetimelike, @@ -1478,6 +1479,9 @@ cdef group_cummin_max( mask : np.ndarray[bool] or None If not None, indices represent missing values, otherwise the mask will not be used + result_mask : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. labels : np.ndarray[np.intp] Labels to group by. ngroups : int @@ -1540,7 +1544,7 @@ cdef group_cummin_max( if not skipna and na_possible and seen_na[lab, j]: if uses_mask: - mask[i, j] = 1 # FIXME: shouldn't alter inplace + result_mask[i, j] = 1 # Set to 0 ensures that we are deterministic and can # downcast if appropriate out[i, j] = 0 @@ -1577,19 +1581,21 @@ def group_cummin( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: """See group_cummin_max.__doc__""" group_cummin_max( - out, - values, - mask, - labels, - ngroups, - is_datetimelike, - skipna, - compute_max=False + out=out, + values=values, + mask=mask, + result_mask=result_mask, + labels=labels, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + skipna=skipna, + compute_max=False, ) @@ -1601,17 +1607,19 @@ def group_cummax( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: """See group_cummin_max.__doc__""" group_cummin_max( - out, - values, - mask, - labels, - ngroups, - is_datetimelike, - skipna, - compute_max=True + out=out, + values=values, + mask=mask, + result_mask=result_mask, + labels=labels, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + skipna=skipna, + compute_max=True, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e984279c17a65..7a25a6b9008dd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -418,9 +418,13 @@ def _masked_ea_wrap_cython_operation( """ orig_values = values - # Copy to ensure input and result masks don't end up shared - mask = values._mask.copy() - result_mask = np.zeros(ngroups, dtype=bool) + # libgroupby functions are responsible for NOT altering mask + mask = values._mask + if self.kind != "aggregate": + result_mask = mask.copy() + else: + result_mask = np.zeros(ngroups, dtype=bool) + arr = values._data res_values = self._cython_op_ndim_compat( @@ -438,12 +442,7 @@ def _masked_ea_wrap_cython_operation( # dtype; last attempt ran into trouble on 32bit linux build res_values = res_values.astype(dtype.type, copy=False) - if self.kind != "aggregate": - out_mask = mask - else: - out_mask = result_mask - - return orig_values._maybe_mask_result(res_values, out_mask) + return orig_values._maybe_mask_result(res_values, result_mask) @final def _cython_op_ndim_compat( @@ -579,6 +578,7 @@ def _call_cython_op( ngroups=ngroups, is_datetimelike=is_datetimelike, mask=mask, + result_mask=result_mask, **kwargs, ) else: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1555e9d02c8ca..ec7f3200b682b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -843,11 +843,15 @@ def test_cummax(dtypes_for_minmax): def test_cummin_max_skipna(method, dtype, groups, expected_data): # GH-34047 df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + orig = df.copy() gb = df.groupby(groups)["a"] result = getattr(gb, method)(skipna=False) expected = Series(expected_data, dtype=dtype, name="a") + # check we didn't accidentally alter df + tm.assert_frame_equal(df, orig) + tm.assert_series_equal(result, expected) From bfd5aaba67b778e73fa19cbec6f4aafb8ea21d78 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Mar 2022 12:09:23 -0800 Subject: [PATCH 2/2] whatnsew --- doc/source/whatsnew/v1.5.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index adee9271c23b0..2db1a636bcf0b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -429,6 +429,8 @@ Groupby/resample/rolling - Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`) - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) +- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) +- Reshaping ^^^^^^^^^