diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index b51cc9f4500c1..1f4ac94b6e2b9 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -32,7 +32,7 @@ def group_shift_indexer( periods: int, ) -> None: ... def group_fillna_indexer( - out: np.ndarray, # ndarray[int64_t] + out: np.ndarray, # ndarray[intp_t] labels: np.ndarray, # ndarray[int64_t] mask: np.ndarray, # ndarray[uint8_t] direction: Literal["ffill", "bfill"], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8ba49e5753bd5..4362743b55396 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -321,7 +321,7 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels, @cython.wraparound(False) @cython.boundscheck(False) -def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels, +def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels, ndarray[uint8_t] mask, str direction, int64_t limit, bint dropna) -> None: """ @@ -329,7 +329,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels, Parameters ---------- - out : np.ndarray[np.int64] + out : np.ndarray[np.intp] Values into which this method will write its results. labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 85e7c9a62b2d4..4afe42704d7ed 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1174,7 +1174,10 @@ def _wrap_transformed_output( Series or DataFrame Series for SeriesGroupBy, DataFrame for DataFrameGroupBy """ - result = self._indexed_output_to_ndframe(output) + if isinstance(output, (Series, DataFrame)): + result = output + else: + result = self._indexed_output_to_ndframe(output) if self.axis == 1: # Only relevant for DataFrameGroupBy @@ -2258,17 +2261,55 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit=None): if limit is None: limit = -1 - return self._get_cythonized_result( + ids, _, _ = self.grouper.group_info + + col_func = partial( libgroupby.group_fillna_indexer, - numeric_only=False, - needs_mask=True, - cython_dtype=np.dtype(np.int64), - result_is_index=True, + labels=ids, direction=direction, limit=limit, dropna=self.dropna, ) + def blk_func(values: ArrayLike) -> ArrayLike: + mask = isna(values) + if values.ndim == 1: + indexer = np.empty(values.shape, dtype=np.intp) + col_func(out=indexer, mask=mask) + return algorithms.take_nd(values, indexer) + + else: + # We broadcast algorithms.take_nd analogous to + # np.take_along_axis + + # Note: we only get here with backfill/pad, + # so if we have a dtype that cannot hold NAs, + # then there will be no -1s in indexer, so we can use + # the original dtype (no need to ensure_dtype_can_hold_na) + if isinstance(values, np.ndarray): + out = np.empty(values.shape, dtype=values.dtype) + else: + out = type(values)._empty(values.shape, dtype=values.dtype) + + for i in range(len(values)): + # call group_fillna_indexer column-wise + indexer = np.empty(values.shape[1], dtype=np.intp) + col_func(out=indexer, mask=mask[i]) + out[i, :] = algorithms.take_nd(values[i], indexer) + return out + + obj = self._obj_with_exclusions + if self.axis == 1: + obj = obj.T + mgr = obj._mgr + res_mgr = mgr.apply(blk_func) + + new_obj = obj._constructor(res_mgr) + if isinstance(new_obj, Series): + new_obj.name = obj.name + + return self._wrap_transformed_output(new_obj) + @final @Substitution(name="groupby") def pad(self, limit=None): @@ -2948,7 +2989,6 @@ def _get_cythonized_result( min_count: int | None = None, needs_mask: bool = False, needs_ngroups: bool = False, - result_is_index: bool = False, pre_processing=None, post_processing=None, fill_value=None, @@ -2985,9 +3025,6 @@ def _get_cythonized_result( needs_nullable : bool, default False Whether a bool specifying if the input is nullable is part of the Cython call signature - result_is_index : bool, default False - Whether the result of the Cython operation is an index of - values to be retrieved, instead of the actual values themselves pre_processing : function, default None Function to be applied to `values` prior to passing to Cython. Function should return a tuple where the first element is the @@ -3013,8 +3050,6 @@ def _get_cythonized_result( """ numeric_only = self._resolve_numeric_only(numeric_only) - if result_is_index and aggregate: - raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing and not callable(post_processing): raise ValueError("'post_processing' must be a callable!") if pre_processing: @@ -3086,14 +3121,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: func(**kwargs) # Call func to modify indexer values in place - if result_is_index: - result = algorithms.take_nd(values, result, fill_value=fill_value) - if real_2d and values.ndim == 1: assert result.shape[1] == 1, result.shape - # error: No overload variant of "__getitem__" of "ExtensionArray" - # matches argument type "Tuple[slice, int]" - result = result[:, 0] # type: ignore[call-overload] + result = result[:, 0] if needs_mask: mask = mask[:, 0] diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index f3149abb52291..525bba984fca5 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -130,6 +130,8 @@ def test_ffill_handles_nan_groups(dropna, method, has_nan_group): ridx = expected_rows.get((method, dropna, has_nan_group)) expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True) + # columns are a 'take' on df.columns, which are object dtype + expected.columns = expected.columns.astype(object) tm.assert_frame_equal(result, expected)