diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f19b0fe10fe6e..5011eb769d73b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -419,6 +419,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.rolling` when the :class:`Series` ``dtype`` was ``Int64`` (:issue:`43016`) - Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns was a :class:`MultiIndex` (:issue:`21157`) - Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`) +- Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1cbfcff5e94f1..fa0bc5fd38de7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -400,7 +400,7 @@ def _wrap_applied_output( if isinstance(values[0], dict): # GH #823 #24880 - index = self._group_keys_index + index = self.grouper.result_index res_df = self.obj._constructor_expanddim(values, index=index) res_df = self._reindex_output(res_df) # if self.observed is False, @@ -413,7 +413,7 @@ def _wrap_applied_output( else: # GH #6265 #24880 result = self.obj._constructor( - data=values, index=self._group_keys_index, name=self.obj.name + data=values, index=self.grouper.result_index, name=self.obj.name ) return self._reindex_output(result) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d989cde09380a..339bb2c30736d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1183,18 +1183,14 @@ def df_cat(df): return df_cat -@pytest.mark.parametrize( - "operation, kwargs", [("agg", {"dtype": "category"}), ("apply", {})] -) -def 
test_seriesgroupby_observed_true(df_cat, operation, kwargs): +@pytest.mark.parametrize("operation", ["agg", "apply"]) +def test_seriesgroupby_observed_true(df_cat, operation): # GH 24880 - index = MultiIndex.from_frame( - DataFrame( - {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, - **kwargs, - ) - ) + lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A") + lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B") + index = MultiIndex.from_arrays([lev_a, lev_b]) expected = Series(data=[1, 3, 2, 4], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=True)["C"] result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1225,18 +1221,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): [ ( True, - MultiIndex.from_tuples( + MultiIndex.from_arrays( [ - ("foo", "one", "min"), - ("foo", "one", "max"), - ("foo", "two", "min"), - ("foo", "two", "max"), - ("bar", "one", "min"), - ("bar", "one", "max"), - ("bar", "three", "min"), - ("bar", "three", "max"), - ], - names=["A", "B", None], + Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"), + Index( + ["one", "one", "two", "two", "one", "one", "three", "three"], + dtype="category", + name="B", + ), + Index(["min", "max"] * 4), + ] ), [1, 1, 3, 3, 2, 2, 4, 4], ), diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a89aabc3763f1..f40fb8cba3435 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -23,6 +23,41 @@ from pandas.core.groupby.ops import BinGrouper +@pytest.fixture +def groupby_with_truncated_bingrouper(): + """ + GroupBy object such that gb.grouper is a BinGrouper and + len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq) + + Aggregations on this groupby should have + + dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date") + + As either 
the index or an index level. + """ + df = DataFrame( + { + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + Timestamp(2013, 9, 1, 13, 0), + Timestamp(2013, 9, 1, 13, 5), + Timestamp(2013, 10, 1, 20, 0), + Timestamp(2013, 10, 3, 10, 0), + pd.NaT, + Timestamp(2013, 9, 2, 14, 0), + ], + } + ) + + tdg = Grouper(key="Date", freq="5D") + gb = df.groupby(tdg) + + # check we're testing the case we're interested in + assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq) + + return gb + + class TestGroupBy: def test_groupby_with_timegrouper(self): # GH 4161 @@ -779,3 +814,36 @@ def test_grouper_period_index(self): range(0, periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) + + def test_groupby_apply_timegrouper_with_nat_dict_returns( + self, groupby_with_truncated_bingrouper + ): + # GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq + # have different lengths that goes through the `isinstance(values[0], dict)` + # path + gb = groupby_with_truncated_bingrouper + + res = gb["Quantity"].apply(lambda x: {"foo": len(x)}) + + dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date") + mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)]) + expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity") + tm.assert_series_equal(res, expected) + + def test_groupby_apply_timegrouper_with_nat_scalar_returns( + self, groupby_with_truncated_bingrouper + ): + # GH#43500 Previously raised ValueError bc used index with incorrect + # length in _wrap_applied_output + gb = groupby_with_truncated_bingrouper + + res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan) + + dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date") + expected = Series( + [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5], + index=dti._with_freq(None), + name="Quantity", + ) + + tm.assert_series_equal(res, expected)