diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 1336fd7d83f7e..b422344f1cd6d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -443,6 +443,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`) - Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`) - Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`) +- Bug in :meth:`DataFrameGroupBy.ffill` and :meth:`DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`) - Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) Reshaping diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index a83634aad3ce2..5a958d5e0bd3c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -344,7 +344,7 @@ def group_shift_indexer(int64_t[:] out, const int64_t[:] labels, @cython.boundscheck(False) def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, ndarray[uint8_t] mask, object direction, - int64_t limit): + int64_t limit, bint dropna): """ Indexes how to fill values forwards or backwards within a group. 
@@ -358,6 +358,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, direction : {'ffill', 'bfill'} Direction for fill to be applied (forwards or backwards, respectively) limit : Consecutive values to fill before stopping, or -1 for no limit + dropna : If True, rows in the NaN group (label -1) are not filled and return NaN Notes ----- @@ -381,7 +382,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, with nogil: for i in range(N): idx = sorted_labels[i] - if mask[idx] == 1: # is missing + if dropna and labels[idx] == -1: # nan-group gets nan-values + curr_fill_idx = -1 + elif mask[idx] == 1: # is missing # Stop filling once we've hit the limit if filled_vals >= limit and limit != -1: curr_fill_idx = -1 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 54d52b1e79da3..c758844da3a2b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1866,6 +1866,7 @@ def _fill(self, direction, limit=None): result_is_index=True, direction=direction, limit=limit, + dropna=self.dropna, ) @Substitution(name="groupby") diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 116aed9935694..70d8dfc20822a 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -82,3 +82,37 @@ def test_fill_consistency(): expected = df.groupby(level=0, axis=0).fillna(method="ffill") result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["ffill", "bfill"]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("has_nan_group", [True, False]) +def test_ffill_handles_nan_groups(dropna, method, has_nan_group): + # GH 34725 + + df_without_nan_rows = pd.DataFrame([(1, 0.1), (2, 0.2)]) + + ridx = [-1, 0, -1, -1, 1, -1] + df = df_without_nan_rows.reindex(ridx).reset_index(drop=True) + + group_b = np.nan if has_nan_group else "b" 
+ df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3) + + grouped = df.groupby(by="group_col", dropna=dropna) + result = getattr(grouped, method)(limit=None) + + expected_rows = { + ("ffill", True, True): [-1, 0, 0, -1, -1, -1], + ("ffill", True, False): [-1, 0, 0, -1, 1, 1], + ("ffill", False, True): [-1, 0, 0, -1, 1, 1], + ("ffill", False, False): [-1, 0, 0, -1, 1, 1], + ("bfill", True, True): [0, 0, -1, -1, -1, -1], + ("bfill", True, False): [0, 0, -1, 1, 1, -1], + ("bfill", False, True): [0, 0, -1, 1, 1, -1], + ("bfill", False, False): [0, 0, -1, 1, 1, -1], + } + + ridx = expected_rows.get((method, dropna, has_nan_group)) + expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True) + + tm.assert_frame_equal(result, expected)