From 0fa2104fbd39a8b18853f2a7023fa6c434e8f19f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 25 Jun 2020 10:07:29 -0500 Subject: [PATCH 01/67] API: User-control of result keys --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 36 ++++++++++++++++++++++++++++++ pandas/core/generic.py | 9 ++++++++ pandas/core/groupby/groupby.py | 10 ++++++++- pandas/core/series.py | 2 ++ pandas/tests/groupby/test_apply.py | 27 ++++++++++++++++++++++ 6 files changed, 84 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6808737d4fa5e..860f0d4308717 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -307,6 +307,7 @@ Other enhancements - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- Added a ``result_group_keys`` keyword to :meth:`~DataFrame.groupby` to control the result index when using ``apply`` (:issue:`34809`) - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 521d16ac0b905..381dca48153ad 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6414,6 +6414,40 @@ def update( a 13.0 13.0 b 12.3 123.0 NaN 12.3 33.0 + +By default, the group keys are not prepended to the index for transforms +passed to ``apply``. + +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df.groupby("Animal").apply(lambda x: x) + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 + +But they are prepended for user-defined functions that return an object +with different row or column labels. + +>>> df.groupby("Animal").apply(lambda x: x.rename(index=np.exp)) + Animal Max Speed +Animal +Falcon 1.000000 Falcon 380.0 + 2.718282 Falcon 370.0 +Parrot 7.389056 Parrot 24.0 + 20.085537 Parrot 26.0 + +To control this behavior, specify ``result_group_keys`` + +>>> df.groupby("Animal", result_group_keys=True).apply(lambda x: x) + Animal Max Speed +Animal +Falcon 0 Falcon 380.0 + 1 Falcon 370.0 +Parrot 2 Parrot 24.0 + 3 Parrot 26.0 """ ) @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) @@ -6428,6 +6462,7 @@ def groupby( squeeze: bool = no_default, observed: bool = False, dropna: bool = True, + result_group_keys: Optional[bool] = None, ) -> "DataFrameGroupBy": from pandas.core.groupby.generic import DataFrameGroupBy @@ -6458,6 +6493,7 @@ def groupby( squeeze=squeeze, observed=observed, dropna=dropna, + result_group_keys=result_group_keys, ) _shared_docs[ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 307bf84068424..941fbd6a8e5db 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7402,6 +7402,15 @@ def clip( with row/column will be dropped. If False, NA values will also be treated as the key in groups + .. 
versionadded:: 1.1.0 + result_group_keys : bool, optional + Whether to prepend the group keys to the result's index when + using ``apply``. By default, this depends on whether the result + of the user-defined function passed to apply is a *transformation* + (which returns a like-indexed object) or not (returns an object + with a different index or column names). Transforms do not get + the group keys prepended as an index level. Other functions do. + .. versionadded:: 1.1.0 Returns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..9bc0f72b4458f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -493,6 +493,7 @@ def __init__( observed: bool = False, mutated: bool = False, dropna: bool = True, + result_group_keys: Optional[bool] = None, ): self._selection = selection @@ -515,6 +516,7 @@ def __init__( self.observed = observed self.mutated = mutated self.dropna = dropna + self.result_group_keys = result_group_keys if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -887,9 +889,15 @@ def _python_apply_general( data after applying f """ keys, values, mutated = self.grouper.apply(f, data, self.axis) + if self.result_group_keys is None: + # infer by default + not_indexed_same = mutated or self.mutated + else: + # but defer to user-specified value + not_indexed_same = bool(self.result_group_keys) return self._wrap_applied_output( - keys, values, not_indexed_same=mutated or self.mutated + keys, values, not_indexed_same=not_indexed_same ) def _iterate_slices(self) -> Iterable[Series]: diff --git a/pandas/core/series.py b/pandas/core/series.py index a652af5efc590..3da62f01183df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1696,6 +1696,7 @@ def groupby( squeeze: bool = no_default, observed: bool = False, dropna: bool = True, + result_group_keys: Optional[bool] = None, ) -> "SeriesGroupBy": from pandas.core.groupby.generic import SeriesGroupBy @@ -1726,6 +1727,7 @@ def groupby( squeeze=squeeze, observed=observed, dropna=dropna, + result_group_keys=result_group_keys, ) # ---------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1945647ced08f..4b8adee09e54e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -995,3 +995,30 @@ def test_apply_function_with_indexing_return_column(): result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) expected = DataFrame({"foo1": ["one", "three", "two"], "foo2": [3.0, 4.0, 4.0]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "udf, is_transform", + [(lambda x: x.copy(), True), (lambda x: x.copy().rename(lambda y: y + 1), False)], +) +@pytest.mark.parametrize("result_group_keys", [True, False, None]) +def test_apply_result_type(result_group_keys, udf, is_transform): + # https://github.com/pandas-dev/pandas/issues/34809 + # We'd like to control whether the group keys end up in the index + # regardless of whether the UDF happens to be a transform. 
+ df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + df_result = df.groupby("A", result_group_keys=result_group_keys).apply(udf) + series_result = df.B.groupby(df.A, result_group_keys=result_group_keys).apply(udf) + + if result_group_keys: + assert df_result.index.nlevels == 2 + assert series_result.index.nlevels == 2 + elif result_group_keys is False: + assert df_result.index.nlevels == 1 + assert series_result.index.nlevels == 1 + elif is_transform: + assert df_result.index.nlevels == 1 + assert series_result.index.nlevels == 1 + else: + assert df_result.index.nlevels == 2 + assert series_result.index.nlevels == 2 From 13a38a2561dae5ccf93b63518f8817e39366d0c8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Jun 2020 08:30:34 -0500 Subject: [PATCH 02/67] wip --- pandas/core/groupby/groupby.py | 5 +++-- pandas/tests/groupby/test_apply.py | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9bc0f72b4458f..0fd5adeba6d25 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -889,12 +889,13 @@ def _python_apply_general( data after applying f """ keys, values, mutated = self.grouper.apply(f, data, self.axis) - if self.result_group_keys is None: + breakpoint() + if self.group_keys is None: # infer by default not_indexed_same = mutated or self.mutated else: # but defer to user-specified value - not_indexed_same = bool(self.result_group_keys) + not_indexed_same = bool(self.group_keys) return self._wrap_applied_output( keys, values, not_indexed_same=not_indexed_same diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4b8adee09e54e..5ef0c22c3667e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1001,24 +1001,24 @@ def test_apply_function_with_indexing_return_column(): "udf, is_transform", [(lambda x: x.copy(), True), (lambda x: x.copy().rename(lambda y: y + 1), False)], ) -@pytest.mark.parametrize("result_group_keys", [True, False, None]) -def test_apply_result_type(result_group_keys, udf, is_transform): +@pytest.mark.parametrize("group_keys", [True, False]) +def test_apply_result_type(group_keys, udf, is_transform): # https://github.com/pandas-dev/pandas/issues/34809 # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) - df_result = df.groupby("A", result_group_keys=result_group_keys).apply(udf) - series_result = df.B.groupby(df.A, result_group_keys=result_group_keys).apply(udf) + df_result = df.groupby("A", group_keys=group_keys).apply(udf) + series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) - if result_group_keys: + if group_keys: assert df_result.index.nlevels == 2 assert series_result.index.nlevels == 2 - elif result_group_keys is False: + elif group_keys is False: assert df_result.index.nlevels == 1 assert series_result.index.nlevels == 1 - elif is_transform: - assert df_result.index.nlevels == 1 - assert series_result.index.nlevels == 1 - else: - assert df_result.index.nlevels == 2 - assert series_result.index.nlevels == 2 + # elif is_transform: + # assert df_result.index.nlevels == 1 + # assert series_result.index.nlevels == 1 + # else: + # assert df_result.index.nlevels == 2 + # assert series_result.index.nlevels == 2 From f8d646f7a353087a012266b7451c73a9f8ca317f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Jun 2020 08:40:01 -0500 Subject: [PATCH 03/67] mmm --- pandas/core/groupby/generic.py | 9 ++++++++- pandas/core/groupby/groupby.py | 1 - pandas/tests/groupby/transform/test_transform.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dab8475d9580c..8eb0ed2aff4d2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1672,12 +1672,19 @@ def _gotitem(self, key, ndim: int, subset=None): exclusions=self.exclusions, as_index=self.as_index, observed=self.observed, + group_keys=self.group_keys, + # TODO: dropna? ) elif ndim == 1: if subset is None: subset = self.obj[key] return SeriesGroupBy( - subset, selection=key, grouper=self.grouper, observed=self.observed + subset, + selection=key, + grouper=self.grouper, + observed=self.observed, + group_keys=self.group_keys + # TODO: dropna, as_index? 
) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0fd5adeba6d25..93fce50cd862e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -889,7 +889,6 @@ def _python_apply_general( data after applying f """ keys, values, mutated = self.grouper.apply(f, data, self.axis) - breakpoint() if self.group_keys is None: # infer by default not_indexed_same = mutated or self.mutated diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cdaf27e214d80..8d6a2bfd88a03 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -727,7 +727,7 @@ def test_cython_transform_frame(op, args, targop): ]: # dict(by='string_missing')]: # dict(by=['int','string'])]: - gb = df.groupby(**gb_target) + gb = df.groupby(group_keys=False, **gb_target) # allowlisted methods set the selection before applying # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior From 623526ced44935e2bae6cfee8f0b41e941ecce20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Jun 2020 08:55:03 -0500 Subject: [PATCH 04/67] updates --- pandas/core/groupby/groupby.py | 6 ++++++ pandas/tests/groupby/test_apply.py | 31 +++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 93fce50cd862e..bcf1034c702f8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -889,6 +889,12 @@ def _python_apply_general( data after applying f """ keys, values, mutated = self.grouper.apply(f, data, self.axis) + # current thought: make `group_keys=None` by default and + # warn if we get here. The default falls back to the mutated + # stuff. + # *but* we might have to treat that specially? + # see test_apply_chunk_view. + # breakpoint() if self.group_keys is None: # infer by default not_indexed_same = mutated or self.mutated diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5ef0c22c3667e..803a9ed42fadd 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -63,12 +63,7 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object." -) -def test_apply_trivial_fail(): +def test_apply_trivial2(): # GH 20066 # trivial apply fails if the constant dataframe has the same index # with the one used to create GroupBy object. 
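The behavior these ``group_keys=False`` test updates pin down is whether ``apply`` prepends the group labels as an extra index level to a transform-like result. Below is a minimal doctest-style sketch of the two settings, mirroring the Falcon/Parrot example from the ``DataFrame.groupby`` docstring earlier in this series; it is illustrative only, since the default and the deprecation path are still being settled in later commits.

>>> import pandas as pd
>>> df = pd.DataFrame({"Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
...                    "Max Speed": [380.0, 370.0, 24.0, 26.0]})
>>> # group_keys=False: the result keeps the original flat index
>>> df.groupby("Animal", group_keys=False).apply(lambda x: x).index.nlevels
1
>>> # group_keys=True: the "Animal" labels are prepended as an index level
>>> df.groupby("Animal", group_keys=True).apply(lambda x: x).index.nlevels
2
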
@@ -247,7 +242,7 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - result = df.groupby("g").apply(func) + result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -304,7 +299,7 @@ def test_groupby_as_index_apply(df): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False).apply(lambda x: x).index + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -354,7 +349,7 @@ def f(piece): dr = bdate_range("1/1/2000", periods=100) ts = Series(np.random.randn(100), index=dr) - grouped = ts.groupby(lambda x: x.month) + grouped = ts.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(f) assert isinstance(result, DataFrame) @@ -408,7 +403,7 @@ def trans2(group): def test_apply_transform(ts): - grouped = ts.groupby(lambda x: x.month) + grouped = ts.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) tm.assert_series_equal(result, expected) @@ -462,7 +457,7 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d").apply(f) + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -486,7 +481,7 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d").apply(f) + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -495,7 +490,7 @@ def f(group): def test_apply_corner(tsframe): - result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + result = tsframe.groupby(lambda x: x.year, group_keys=False).apply(lambda x: x * 2) expected = tsframe * 2 tm.assert_frame_equal(result, expected) @@ -537,14 +532,14 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): ) if test_series: ser = df.set_index("Y")["X"] - result = ser.groupby(level=0).apply(lambda x: x) + result = ser.groupby(level=0, group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_index() expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y").apply(lambda x: x) + result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -568,7 +563,7 @@ def reindex_helper(x): return x.reindex(np.arange(x.index.min(), x.index.max() + 1)) # the following group by raised a ValueError - result = df.groupby("group").value.apply(reindex_helper) + result = df.groupby("group", group_keys=False).value.apply(reindex_helper) tm.assert_series_equal(expected, result) @@ -781,7 +776,7 @@ def test_groupby_apply_return_empty_chunk(): def test_apply_with_mixed_types(): # gh-20949 df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) - g = df.groupby("A") + g = df.groupby("A", group_keys=False) result = g.transform(lambda x: x / x.sum()) expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) @@ -907,7 +902,7 @@ def test_groupby_apply_datetime_result_dtypes(): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group").apply(lambda 
x: x) + result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) From 6871ed0cc334da6c2ad697d6a077d386b7dc64e7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Jun 2020 09:26:53 -0500 Subject: [PATCH 05/67] fixups --- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/frame.py | 6 +-- pandas/core/generic.py | 9 ---- pandas/core/groupby/generic.py | 2 + pandas/core/groupby/groupby.py | 45 ++++++++----------- pandas/core/series.py | 2 - pandas/tests/groupby/test_apply.py | 10 ++++- pandas/tests/groupby/test_function.py | 32 ++++++++----- pandas/tests/groupby/test_groupby.py | 12 ++--- pandas/tests/groupby/test_timegrouper.py | 2 +- .../tests/groupby/transform/test_transform.py | 26 +++++++---- 11 files changed, 78 insertions(+), 69 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 62882bbc7cbb8..75f406d908c73 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -308,7 +308,6 @@ Other enhancements - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). -- Added a ``result_group_keys`` keyword to :meth:`~DataFrame.groupby` to control the result index when using ``apply`` (:issue:`34809`) - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 381dca48153ad..08862770e40b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6439,9 +6439,9 @@ def update( Parrot 7.389056 Parrot 24.0 20.085537 Parrot 26.0 -To control this behavior, specify ``result_group_keys`` +To control this behavior, specify ``group_keys`` ->>> df.groupby("Animal", result_group_keys=True).apply(lambda x: x) +>>> df.groupby("Animal", group_keys=True).apply(lambda x: x) Animal Max Speed Animal Falcon 0 Falcon 380.0 @@ -6462,7 +6462,6 @@ def groupby( squeeze: bool = no_default, observed: bool = False, dropna: bool = True, - result_group_keys: Optional[bool] = None, ) -> "DataFrameGroupBy": from pandas.core.groupby.generic import DataFrameGroupBy @@ -6493,7 +6492,6 @@ def groupby( squeeze=squeeze, observed=observed, dropna=dropna, - result_group_keys=result_group_keys, ) _shared_docs[ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 82a0e8a48e141..4e0247bfcddca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7406,15 +7406,6 @@ def clip( with row/column will be dropped. If False, NA values will also be treated as the key in groups - .. versionadded:: 1.1.0 - result_group_keys : bool, optional - Whether to prepend the group keys to the result's index when - using ``apply``. By default, this depends on whether the result - of the user-defined function passed to apply is a *transformation* - (which returns a like-indexed object) or not (returns an object - with a different index or column names). 
Transforms do not get - the group keys prepended as an index level. Other functions do. - .. versionadded:: 1.1.0 Returns diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8eb0ed2aff4d2..7b90ffbcc1bb0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -414,6 +414,7 @@ def _wrap_transformed_output( return result def _wrap_applied_output(self, keys, values, not_indexed_same=False): + # breakpoint() if len(keys) == 0: # GH #6265 return self.obj._constructor( @@ -1204,6 +1205,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) def _wrap_applied_output(self, keys, values, not_indexed_same=False): + # breakpoint() if len(keys) == 0: return self.obj._constructor(index=keys) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bcf1034c702f8..887061ccd160c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -493,7 +493,6 @@ def __init__( observed: bool = False, mutated: bool = False, dropna: bool = True, - result_group_keys: Optional[bool] = None, ): self._selection = selection @@ -516,7 +515,6 @@ def __init__( self.observed = observed self.mutated = mutated self.dropna = dropna - self.result_group_keys = result_group_keys if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -894,13 +892,7 @@ def _python_apply_general( # stuff. # *but* we might have to treat that specially? # see test_apply_chunk_view. - # breakpoint() - if self.group_keys is None: - # infer by default - not_indexed_same = mutated or self.mutated - else: - # but defer to user-specified value - not_indexed_same = bool(self.group_keys) + not_indexed_same = mutated or self.mutated return self._wrap_applied_output( keys, values, not_indexed_same=not_indexed_same @@ -1125,23 +1117,7 @@ def reset_identity(values): ax._reset_identity() return values - if not not_indexed_same: - result = concat(values, axis=self.axis) - ax = self._selected_obj._get_axis(self.axis) - - # this is a very unfortunate situation - # we can't use reindex to restore the original order - # when the ax has duplicates - # so we resort to this - # GH 14776, 30667 - if ax.has_duplicates: - indexer, _ = result.index.get_indexer_non_unique(ax.values) - indexer = algorithms.unique1d(indexer) - result = result.take(indexer, axis=self.axis) - else: - result = result.reindex(ax, axis=self.axis) - - elif self.group_keys: + if self.group_keys: values = reset_identity(values) if self.as_index: @@ -1165,6 +1141,23 @@ def reset_identity(values): # range index keys = list(range(len(values))) result = concat(values, axis=self.axis, keys=keys) + + elif not not_indexed_same: + result = concat(values, axis=self.axis) + ax = self._selected_obj._get_axis(self.axis) + + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + if ax.has_duplicates: + indexer, _ = result.index.get_indexer_non_unique(ax.values) + indexer = algorithms.unique1d(indexer) + result = result.take(indexer, axis=self.axis) + else: + result = result.reindex(ax, axis=self.axis) + else: values = reset_identity(values) result = concat(values, axis=self.axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3da62f01183df..a652af5efc590 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1696,7 +1696,6 @@ def groupby( squeeze: bool = no_default, 
observed: bool = False, dropna: bool = True, - result_group_keys: Optional[bool] = None, ) -> "SeriesGroupBy": from pandas.core.groupby.generic import SeriesGroupBy @@ -1727,7 +1726,6 @@ def groupby( squeeze=squeeze, observed=observed, dropna=dropna, - result_group_keys=result_group_keys, ) # ---------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 803a9ed42fadd..86e0c4ac40699 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -420,12 +420,18 @@ def f(group): tm.assert_frame_equal(result.loc[key], f(group)) -def test_apply_chunk_view(): +@pytest.mark.parametrize("group_keys", [True, False]) +def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=False).apply(lambda x: x[:2]) + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) + if group_keys: + expected.index = pd.MultiIndex.from_arrays( + [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None] + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 6f19ec40c2520..8b928a15c4f6e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -341,10 +341,10 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) # GH 13994 - result = df.groupby("A").cumsum(axis=1) + result = df.groupby("A", group_keys=False).cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) - result = df.groupby("A").cumprod(axis=1) + result = df.groupby("A", group_keys=False).cumprod(axis=1) expected = df.cumprod(axis=1) tm.assert_frame_equal(result, expected) @@ -527,7 +527,7 @@ def test_groupby_cumprod(): df = pd.DataFrame({"key": ["b"] * 10, "value": 2}) actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) expected.name = "value" tm.assert_series_equal(actual, expected) @@ -536,7 +536,7 @@ def test_groupby_cumprod(): # if overflows, groupby product casts to float # while numpy passes back invalid values df["value"] = df["value"].astype(float) - expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) expected.name = "value" tm.assert_series_equal(actual, expected) @@ -701,7 +701,7 @@ def test_cummin(numpy_dtypes_for_minmax): expected = pd.DataFrame({"B": expected_mins}).astype(dtype) result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # Test w/ min value for dtype @@ -709,7 +709,9 @@ def test_cummin(numpy_dtypes_for_minmax): expected.loc[[2, 3, 6, 7], "B"] = min_val result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) tm.assert_frame_equal(result, expected) # Test nan in some values @@ -717,7 +719,9 @@ def 
test_cummin(numpy_dtypes_for_minmax): expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) result = base_df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) tm.assert_frame_equal(result, expected) # GH 15561 @@ -740,7 +744,9 @@ def test_cummin_all_nan_column(): expected = pd.DataFrame({"B": [np.nan] * 8}) result = base_df.groupby("A").cummin() tm.assert_frame_equal(expected, result) - result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + result = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) tm.assert_frame_equal(expected, result) @@ -759,7 +765,7 @@ def test_cummax(numpy_dtypes_for_minmax): expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test w/ max value for dtype @@ -767,7 +773,9 @@ def test_cummax(numpy_dtypes_for_minmax): expected.loc[[2, 3, 6, 7], "B"] = max_val result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) tm.assert_frame_equal(result, expected) # Test nan in some values @@ -775,7 +783,9 @@ def test_cummax(numpy_dtypes_for_minmax): expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) result = base_df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) tm.assert_frame_equal(result, expected) # GH 15561 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0d040b8e6955a..76dc7bc66119e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -30,7 +30,7 @@ def test_basic(dtype): np.random.shuffle(index) data = data.reindex(index) - grouped = data.groupby(lambda x: x // 3) + grouped = data.groupby(lambda x: x // 3, group_keys=False) for k, v in grouped: assert len(v) == 3 @@ -1345,7 +1345,7 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key").apply(lambda x: x) + result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1567,13 +1567,15 @@ def test_groupby_multiindex_not_lexsorted(): for level in [0, 1, [0, 1]]: for sort in [False, True]: - result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates) + result = df.groupby(level=level, sort=sort, group_keys=False).apply( + DataFrame.drop_duplicates + ) expected = df tm.assert_frame_equal(expected, result) result = ( df.sort_index() - .groupby(level=level, sort=sort) + .groupby(level=level, sort=sort, group_keys=False) .apply(DataFrame.drop_duplicates) ) expected = df.sort_index() @@ -1692,7 +1694,7 @@ def test_group_shift_with_fill_value(): columns=["A", "B", "Z"], index=None, ) - g = df.groupby(["A", "B"]) + g = df.groupby(["A", "B"], group_keys=False) expected = 
DataFrame( [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 84fd7a1bdfb05..d410648a2d748 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -547,7 +547,7 @@ def test_groupby_multi_timezone(self): 4,2000-01-01 16:50:00,America/New_York""" df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"]) - result = df.groupby("tz").date.apply( + result = df.groupby("tz", group_keys=False).date.apply( lambda x: pd.to_datetime(x).dt.tz_localize(x.name) ) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 8d6a2bfd88a03..51039f11f23dc 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -64,8 +64,18 @@ def demean(arr): index=["Joe", "Steve", "Wes", "Jim", "Travis"], ) key = ["one", "two", "one", "two", "one"] - result = people.groupby(key).transform(demean).groupby(key).mean() - expected = people.groupby(key).apply(demean).groupby(key).mean() + result = ( + people.groupby(key, group_keys=False) + .transform(demean) + .groupby(key, group_keys=False) + .mean() + ) + expected = ( + people.groupby(key, group_keys=False) + .apply(demean) + .groupby(key, group_keys=False) + .mean() + ) tm.assert_frame_equal(result, expected) # GH 8430 @@ -175,26 +185,26 @@ def test_transform_axis(tsframe): ) # monotonic ts = tso - grouped = ts.groupby(lambda x: x.weekday()) + grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: x - x.mean()) tm.assert_frame_equal(result, expected) ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) + grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) tm.assert_frame_equal(result, expected) # non-monotonic ts = tso.iloc[[1, 0] + list(range(2, len(base)))] - grouped = ts.groupby(lambda x: x.weekday()) + grouped = ts.groupby(lambda x: x.weekday(), group_keys=False) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: x - x.mean()) tm.assert_frame_equal(result, expected) ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) + grouped = ts.groupby(lambda x: x.weekday(), axis=1, group_keys=False) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) tm.assert_frame_equal(result, expected) @@ -309,7 +319,7 @@ def test_transform_multiple(ts): def test_dispatch_transform(tsframe): df = tsframe[::5].reindex(tsframe.index) - grouped = df.groupby(lambda x: x.month) + grouped = df.groupby(lambda x: x.month, group_keys=False) filled = grouped.fillna(method="pad") fillit = lambda x: x.fillna(method="pad") @@ -343,7 +353,7 @@ def test_transform_transformation_func(transformation_func): test_op = lambda x: x.transform(transformation_func) mock_op = lambda x: getattr(x, transformation_func)() - result = test_op(df.groupby("A")) + result = test_op(df.groupby("A", group_keys=False)) groups = [df[["B"]].iloc[:4], df[["B"]].iloc[4:6], df[["B"]].iloc[6:]] expected = concat([mock_op(g) for g in groups]) From 00ce5dc5cde0cc6c55af075dba9d030c3726e4ad Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 08:20:49 -0500 Subject: [PATCH 06/67] test fixups --- pandas/core/groupby/groupby.py | 1 - 
pandas/core/resample.py | 4 +++- pandas/tests/groupby/test_function.py | 4 +++- pandas/tests/resample/test_resample_api.py | 4 +++- pandas/tests/test_multilevel.py | 2 +- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 887061ccd160c..9c15a7b5cdcb6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -825,7 +825,6 @@ def __iter__(self): ) ) def apply(self, func, *args, **kwargs): - func = self._is_builtin_func(func) # this is needed so we don't try and wrap strings. If we could diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bfdfc65723433..dfe13f384d8db 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -357,7 +357,9 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): obj = self._selected_obj - grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby( + obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys + ) try: if isinstance(obj, ABCDataFrame) and callable(how): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 8b928a15c4f6e..d6a4d918a7448 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -808,7 +808,9 @@ def test_cummax_all_nan_column(): expected = pd.DataFrame({"B": [np.nan] * 8}) result = base_df.groupby("A").cummax() tm.assert_frame_equal(expected, result) - result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + result = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 73aa01cff84fa..681e4c46a602c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -274,7 +274,9 @@ def test_apply_without_aggregation(): # both resample and groupby should work w/o aggregation r = test_series.resample("20min") - g = test_series.groupby(pd.Grouper(freq="20min")) + + r.group_keys = False # XXX: Add to .resample API? + g = test_series.groupby(pd.Grouper(freq="20min"), group_keys=False) for t in [g, r]: result = t.apply(lambda x: x) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1ba73292dc0b4..f2b293e4e8b47 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -787,7 +787,7 @@ def test_groupby_transform(self): s = self.frame["A"] grouper = s.index.get_level_values(0) - grouped = s.groupby(grouper) + grouped = s.groupby(grouper, group_keys=False) applied = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) From c0b1140937d0270632140bc076cdd836a1058b0d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 08:25:15 -0500 Subject: [PATCH 07/67] update doctests --- pandas/core/frame.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7ecdd2bef4dfe..1f133e11d4be4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6416,32 +6416,18 @@ def update( b 12.3 123.0 NaN 12.3 33.0 -By default, the group keys are not prepended to the index for transforms -passed to ``apply``. +To exclude or include the group keys in the index, specify ``group_keys`` >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 
'Max Speed': [380., 370., 24., 26.]}) ->>> df.groupby("Animal").apply(lambda x: x) +>>> df.groupby("Animal", group_keys=False).apply(lambda x: x) Animal Max Speed 0 Falcon 380.0 1 Falcon 370.0 2 Parrot 24.0 3 Parrot 26.0 -But they are prepended for user-defined functions that return an object -with different row or column labels. - ->>> df.groupby("Animal").apply(lambda x: x.rename(index=np.exp)) - Animal Max Speed -Animal -Falcon 1.000000 Falcon 380.0 - 2.718282 Falcon 370.0 -Parrot 7.389056 Parrot 24.0 - 20.085537 Parrot 26.0 - -To control this behavior, specify ``group_keys`` - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) Animal Max Speed Animal From cfd4d73680a7f3f6070d3aa329c06958b5f95881 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 08:40:45 -0500 Subject: [PATCH 08/67] resample --- pandas/core/generic.py | 8 ++++++++ pandas/core/groupby/generic.py | 6 +----- pandas/core/groupby/groupby.py | 5 ----- pandas/core/resample.py | 24 +++++++++++++++------- pandas/tests/resample/test_resample_api.py | 8 +++++--- 5 files changed, 31 insertions(+), 20 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a66cade3b81b0..4d78bd424e28c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7692,6 +7692,7 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, + group_keys: Optional[bool_t] = True, ) -> "Resampler": """ Resample time-series data. @@ -7761,6 +7762,12 @@ def resample( .. versionadded:: 1.1.0 + group_keys : bool, default True + Whether to include the group keys in the result index when performing + a ``.groupby().apply()`` to to the resampled object. + + .. versionadded:: 1.1.0 + Returns ------- Resampler object @@ -8077,6 +8084,7 @@ def resample( level=level, origin=origin, offset=offset, + group_keys=group_keys, ) def first(self: FrameOrSeries, offset) -> FrameOrSeries: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7b90ffbcc1bb0..bd419906a764b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -414,7 +414,6 @@ def _wrap_transformed_output( return result def _wrap_applied_output(self, keys, values, not_indexed_same=False): - # breakpoint() if len(keys) == 0: # GH #6265 return self.obj._constructor( @@ -1205,7 +1204,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) def _wrap_applied_output(self, keys, values, not_indexed_same=False): - # breakpoint() if len(keys) == 0: return self.obj._constructor(index=keys) @@ -1675,7 +1673,6 @@ def _gotitem(self, key, ndim: int, subset=None): as_index=self.as_index, observed=self.observed, group_keys=self.group_keys, - # TODO: dropna? ) elif ndim == 1: if subset is None: @@ -1685,8 +1682,7 @@ def _gotitem(self, key, ndim: int, subset=None): selection=key, grouper=self.grouper, observed=self.observed, - group_keys=self.group_keys - # TODO: dropna, as_index? 
+ group_keys=self.group_keys, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9c15a7b5cdcb6..4e35686876231 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -886,11 +886,6 @@ def _python_apply_general( data after applying f """ keys, values, mutated = self.grouper.apply(f, data, self.axis) - # current thought: make `group_keys=None` by default and - # warn if we get here. The default falls back to the mutated - # stuff. - # *but* we might have to treat that specially? - # see test_apply_chunk_view. not_indexed_same = mutated or self.mutated return self._wrap_applied_output( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index dfe13f384d8db..311406bc704cb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -77,14 +77,14 @@ class Resampler(_GroupBy, ShallowMixin): "offset", ] - def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): + def __init__(self, obj, groupby=None, axis=0, kind=None, group_keys=True, **kwargs): self.groupby = groupby self.keys = None self.sort = True self.axis = axis self.kind = kind self.squeeze = False - self.group_keys = True + self.group_keys = group_keys self.as_index = True self.exclusions = set() self.binner = None @@ -284,6 +284,7 @@ def pipe(self, func, *args, **kwargs): ) def aggregate(self, func, *args, **kwargs): + assert not self.group_keys self._set_binner() result, how = self._aggregate(func, *args, **kwargs) if result is None: @@ -339,7 +340,9 @@ def _gotitem(self, key, ndim: int, subset=None): grouper = self.grouper if subset is None: subset = self.obj - grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby( + subset, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys + ) # try the key selection try: @@ -356,7 +359,6 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): grouper = self.grouper obj = self._selected_obj - grouped = get_groupby( obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys ) @@ -1327,6 +1329,7 @@ def __init__( base: Optional[int] = None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, + group_keys: Optional[bool] = True, **kwargs, ): # Check for correctness of the keyword arguments which would @@ -1363,6 +1366,7 @@ def __init__( self.how = how self.fill_method = fill_method self.limit = limit + self.group_keys = group_keys if origin in ("epoch", "start", "start_day"): self.origin = origin @@ -1427,11 +1431,17 @@ def _get_resampler(self, obj, kind=None): ax = self.ax if isinstance(ax, DatetimeIndex): - return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) + return DatetimeIndexResampler( + obj, groupby=self, kind=kind, axis=self.axis, group_keys=self.group_keys + ) elif isinstance(ax, PeriodIndex) or kind == "period": - return PeriodIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) + return PeriodIndexResampler( + obj, groupby=self, kind=kind, axis=self.axis, group_keys=self.group_keys + ) elif isinstance(ax, TimedeltaIndex): - return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis) + return TimedeltaIndexResampler( + obj, groupby=self, axis=self.axis, group_keys=self.group_keys + ) raise TypeError( "Only valid with DatetimeIndex, " diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 
681e4c46a602c..425cb34495655 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -273,15 +273,17 @@ def test_fillna(): def test_apply_without_aggregation(): # both resample and groupby should work w/o aggregation - r = test_series.resample("20min") - - r.group_keys = False # XXX: Add to .resample API? + r = test_series.resample("20min", group_keys=False) g = test_series.groupby(pd.Grouper(freq="20min"), group_keys=False) for t in [g, r]: result = t.apply(lambda x: x) tm.assert_series_equal(result, test_series) + grouped = test_series.to_frame(name="foo").resample("20min", group_keys=False) + result = grouped["foo"].apply(lambda x: x) + tm.assert_series_equal(result, test_series.rename("foo")) + def test_agg_consistency(): From c05b1ea43de3321bfaa0775475f575c790993d1a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 09:06:22 -0500 Subject: [PATCH 09/67] warning --- pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 18 ++++++++++++++++++ pandas/tests/groupby/test_apply.py | 21 ++++++++++++++------- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f133e11d4be4..4ed0832d439f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6445,7 +6445,7 @@ def groupby( level=None, as_index: bool = True, sort: bool = True, - group_keys: bool = True, + group_keys: Optional[bool] = None, squeeze: bool = no_default, observed: bool = False, dropna: bool = True, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e35686876231..b8bdc2fdaddb0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -29,6 +29,7 @@ class providing the base-class of operations. TypeVar, Union, ) +import warnings import numpy as np @@ -888,6 +889,23 @@ def _python_apply_general( keys, values, mutated = self.grouper.apply(f, data, self.axis) not_indexed_same = mutated or self.mutated + if not not_indexed_same and self.group_keys is None: + if self._selection is None: + stacklevel = 3 + else: + stacklevel = 4 + msg = ( + "Not prepending group keys to the result index of " + "transform-like apply. 
In the future, the group keys " + "will be included in the index, regardless of whether " + "the applied function returns a like-indexed object.\n" + "To preserve the previous behavior, use\n\n\t" + ">>> .groupby(..., group_keys=False)\n\n" + "To adopt the future behavior and silence this warning, use " + "\n\n\t>>> .groupby(..., group_keys=True)" + ) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + return self._wrap_applied_output( keys, values, not_indexed_same=not_indexed_same ) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 86e0c4ac40699..650afc879861f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1014,12 +1014,19 @@ def test_apply_result_type(group_keys, udf, is_transform): if group_keys: assert df_result.index.nlevels == 2 assert series_result.index.nlevels == 2 - elif group_keys is False: + else: assert df_result.index.nlevels == 1 assert series_result.index.nlevels == 1 - # elif is_transform: - # assert df_result.index.nlevels == 1 - # assert series_result.index.nlevels == 1 - # else: - # assert df_result.index.nlevels == 2 - # assert series_result.index.nlevels == 2 + + +def test_groupby_apply_group_keys_warns(): + df = pd.DataFrame({"A": [0, 1, 1], "B": [1, 2, 3]}) + with tm.assert_produces_warning(FutureWarning): + result = df.groupby("A").apply(lambda x: x) + + tm.assert_frame_equal(result, df) + + with tm.assert_produces_warning(FutureWarning): + result = df.groupby("A")["B"].apply(lambda x: x) + + tm.assert_series_equal(result, df["B"]) From 919a4c74b2870b808612808ab121942894b41f40 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 09:07:43 -0500 Subject: [PATCH 10/67] remove debug --- pandas/core/resample.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 311406bc704cb..cf343335a1b4b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -284,7 +284,6 @@ def pipe(self, func, *args, **kwargs): ) def aggregate(self, func, *args, **kwargs): - assert not self.group_keys self._set_binner() result, how = self._aggregate(func, *args, **kwargs) if result is None: From 4a45ea0f37e3f0e86f8673f06652a892dea2cf0e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 09:11:31 -0500 Subject: [PATCH 11/67] warning --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/groupby.py | 4 +++- pandas/core/series.py | 2 +- pandas/tests/groupby/test_apply.py | 5 +++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c5eb2febe8ae9..d0290bdf32b7f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -785,6 +785,7 @@ Deprecations - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) +- :meth:`~DataFrame.groupby` no longer ignores the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`). 
- :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) - Passing any arguments but the first one to :func:`read_html` as diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b8bdc2fdaddb0..1214b0384dd0c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -890,7 +890,9 @@ def _python_apply_general( not_indexed_same = mutated or self.mutated if not not_indexed_same and self.group_keys is None: - if self._selection is None: + if self.ndim == 1: + stacklevel = 4 + elif self._selection is None: stacklevel = 3 else: stacklevel = 4 diff --git a/pandas/core/series.py b/pandas/core/series.py index 54b85afea4964..39fb26a04fbed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1692,7 +1692,7 @@ def groupby( level=None, as_index: bool = True, sort: bool = True, - group_keys: bool = True, + group_keys: Optional[bool] = None, squeeze: bool = no_default, observed: bool = False, dropna: bool = True, diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 650afc879861f..652bf4e84a34d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1030,3 +1030,8 @@ def test_groupby_apply_group_keys_warns(): result = df.groupby("A")["B"].apply(lambda x: x) tm.assert_series_equal(result, df["B"]) + + with tm.assert_produces_warning(FutureWarning): + result = df["B"].groupby(df["A"]).apply(lambda x: x) + + tm.assert_series_equal(result, df["B"]) From 7f1478fc7ad6bc9dc943071c23499f88aa5d1f0e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 09:12:06 -0500 Subject: [PATCH 12/67] warning --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4d78bd424e28c..6fba4c955383d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7692,7 +7692,7 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, - group_keys: Optional[bool_t] = True, + group_keys: Optional[bool_t] = None, ) -> "Resampler": """ Resample time-series data. From a1d4da80c3597bd6962653c662e60e9249424e43 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 09:31:47 -0500 Subject: [PATCH 13/67] warning --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 5 +++-- pandas/core/series.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4ed0832d439f4..20745724a0717 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6445,7 +6445,7 @@ def groupby( level=None, as_index: bool = True, sort: bool = True, - group_keys: Optional[bool] = None, + group_keys: bool = no_default, squeeze: bool = no_default, observed: bool = False, dropna: bool = True, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6fba4c955383d..457773a1133e0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7692,7 +7692,7 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, - group_keys: Optional[bool_t] = None, + group_keys: bool_t = lib.no_default, ) -> "Resampler": """ Resample time-series data. 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1214b0384dd0c..676c78667fb23 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -35,7 +35,7 @@ class providing the base-class of operations. from pandas._config.config import option_context -from pandas._libs import Timestamp +from pandas._libs import Timestamp, lib import pandas._libs.groupby as libgroupby from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.compat.numpy import function as nv @@ -889,7 +889,7 @@ def _python_apply_general( keys, values, mutated = self.grouper.apply(f, data, self.axis) not_indexed_same = mutated or self.mutated - if not not_indexed_same and self.group_keys is None: + if not not_indexed_same and self.group_keys is lib.no_default: if self.ndim == 1: stacklevel = 4 elif self._selection is None: @@ -907,6 +907,7 @@ def _python_apply_general( "\n\n\t>>> .groupby(..., group_keys=True)" ) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + self.group_keys = False # mutating a stateful object... return self._wrap_applied_output( keys, values, not_indexed_same=not_indexed_same diff --git a/pandas/core/series.py b/pandas/core/series.py index 39fb26a04fbed..55237c82ff575 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1692,7 +1692,7 @@ def groupby( level=None, as_index: bool = True, sort: bool = True, - group_keys: Optional[bool] = None, + group_keys: bool = no_default, squeeze: bool = no_default, observed: bool = False, dropna: bool = True, From 9c229c64841f241b3627297fd135b21bd1abd8f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 10:49:29 -0500 Subject: [PATCH 14/67] wip --- pandas/tests/groupby/test_apply.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 652bf4e84a34d..d27f714ac07de 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -44,7 +44,9 @@ def test_apply_issues(): ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) + result = df.groupby("date", group_keys=False).apply( + lambda x: x["time"][x["value"].idxmax()] + ) tm.assert_series_equal(result, expected) @@ -72,7 +74,9 @@ def test_apply_trivial2(): columns=["key", "data"], ) expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) - result = df.groupby([str(x) for x in df.dtypes], axis=1).apply(lambda x: df) + result = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True).apply( + lambda x: df + ) tm.assert_frame_equal(result, expected) @@ -181,7 +185,7 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - df.groupby("a").apply(func) + df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -199,7 +203,9 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - df.groupby("group_by_column").apply(lambda df: print("function_called")) + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -219,8 +225,8 @@ def slow(group): def fast(group): return group.copy() - fast_df = df.groupby("A").apply(fast) - 
slow_df = df.groupby("A").apply(slow) + fast_df = df.groupby("A", group_keys=False).apply(fast) + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) From f1a570bf90a3ea880135081c22f11b7deb4f9a22 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 09:30:55 -0500 Subject: [PATCH 15/67] wip --- pandas/_libs/reduction.pyx | 12 +++++++++++- pandas/core/groupby/groupby.py | 18 ++++++++++++++---- pandas/core/groupby/ops.py | 8 ++++++-- pandas/tests/groupby/test_apply.py | 13 ++++++++----- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_grouping.py | 4 ++-- 6 files changed, 42 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 97c491776f831..ea28c25f06de5 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -370,7 +370,17 @@ def apply_frame_axis0(object frame, object f, object names, mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int - pass + # By definition, we are not a transform, so set mutated + # to True + mutated = True + if not mutated: + # Also check if the columns are mutated + try: + if not piece.columns.equals(chunk.columns): + mutated = True + except AttributeError: + mutated = True + if not is_scalar(piece): # Need to copy data to avoid appending references diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 676c78667fb23..110a3c0aa26c7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -762,8 +762,11 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) + is_transform = name in {"diff", "fillna", "tshift"} try: - return self._python_apply_general(curried, self._obj_with_exclusions) + return self._python_apply_general( + curried, self._obj_with_exclusions, is_transform=is_transform + ) except TypeError as err: if not re.search( "reduction operation '.*' not allowed for this dtype", str(err) @@ -869,7 +872,7 @@ def f(g): return result def _python_apply_general( - self, f: F, data: FrameOrSeriesUnion + self, f: F, data: FrameOrSeriesUnion, is_transform=None ) -> FrameOrSeriesUnion: """ Apply function f in python space @@ -880,6 +883,11 @@ def _python_apply_general( Function to apply data : Series or DataFrame Data to apply f to + is_transform : bool, optional + Indicator for whether the function is actually a transform + and should not have group keys prepended. This is used + in _make_wrapper which generates both transforms (e.g. diff) + and non-transforms (e.g. corr) Returns ------- @@ -906,8 +914,10 @@ def _python_apply_general( "To adopt the future behavior and silence this warning, use " "\n\n\t>>> .groupby(..., group_keys=True)" ) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - self.group_keys = False # mutating a stateful object... + if not (is_transform is True): + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + # XXX: mutating a stateful object. 
Consider passing a var through wrap_applied_output + self.group_keys = False return self._wrap_applied_output( keys, values, not_indexed_same=not_indexed_same diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74db87f46c5e2..42f9726f395da 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -901,8 +901,12 @@ def _is_indexed_like(obj, axes) -> bool: if len(axes) > 1: return False return obj.index.equals(axes[0]) - elif isinstance(obj, DataFrame): - return obj.index.equals(axes[0]) + + is_frame = isinstance(obj, DataFrame) + if is_frame and len(axes) == 1: + return True + elif is_frame: + return obj.index.equals(axes[0]) and obj.columns.equals(axes[1]) return False diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d27f714ac07de..e944b7c96320f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -84,7 +84,7 @@ def test_apply_trivial2(): def test_fast_apply(): # make sure that fast apply is correctly called # rather than raising any kind of error - # otherwise the python path will be callsed + # otherwise the python path will be called # which slows things down N = 1000 labels = np.random.randint(0, 2000, size=N) @@ -98,9 +98,12 @@ def test_fast_apply(): } ) - def f(g): + def f1(g): return 1 + def f2(g): + return g + g = df.groupby(["key", "key2"]) grouper = g.grouper @@ -109,9 +112,9 @@ def f(g): group_keys = grouper._get_group_keys() sdata = splitter._get_sorted_data() - values, mutated = splitter.fast_apply(f, sdata, group_keys) - - assert not mutated + for f, expected_mutated in [(f1, True), (f2, False)]: + values, mutated = splitter.fast_apply(f, sdata, group_keys) + assert mutated is expected_mutated @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 76dc7bc66119e..2d31c059122ab 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1417,7 +1417,7 @@ def freduce(group): def foo(x): return freduce(x) - grouped = df.groupby(grouper) + grouped = df.groupby(grouper, group_keys=False) # make sure all these work grouped.apply(f) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index efcd22f9c0c82..19c0c2cc95491 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -144,10 +144,10 @@ def test_grouper_index_types(self): ]: df.index = index(len(df)) - df.groupby(list("abcde")).apply(lambda x: x) + df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) df.index = list(reversed(df.index.tolist())) - df.groupby(list("abcde")).apply(lambda x: x) + df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) def test_grouper_multilevel_freq(self): From 914c7cf2ab7cf8fe6cc2543a5074fcab3d67c451 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 16:38:04 -0500 Subject: [PATCH 16/67] more --- pandas/core/groupby/groupby.py | 12 ++++++++---- pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 4 +++- pandas/tests/groupby/test_grouping.py | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 110a3c0aa26c7..3a50d684105d1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -872,7 +872,7 @@ def f(g): return result def _python_apply_general( - self, f: F, data: FrameOrSeriesUnion, is_transform=None + self, 
f: F, data: FrameOrSeriesUnion, is_transform=False, is_empty_agg=False, ) -> FrameOrSeriesUnion: """ Apply function f in python space @@ -883,11 +883,15 @@ def _python_apply_general( Function to apply data : Series or DataFrame Data to apply f to - is_transform : bool, optional + is_transform : bool, default False Indicator for whether the function is actually a transform and should not have group keys prepended. This is used in _make_wrapper which generates both transforms (e.g. diff) and non-transforms (e.g. corr) + is_empty_agg : bool, default False + Indicator for whether the function is actually an aggregation + on an empty result. We don't want to warn for this case. + See _GroupBy._python_agg_general. Returns ------- @@ -914,7 +918,7 @@ def _python_apply_general( "To adopt the future behavior and silence this warning, use " "\n\n\t>>> .groupby(..., group_keys=True)" ) - if not (is_transform is True): + if not (is_transform or is_empty_agg): warnings.warn(msg, FutureWarning, stacklevel=stacklevel) # XXX: mutating a stateful object. Consider passing a var through wrap_applied_output self.group_keys = False @@ -1115,7 +1119,7 @@ def _python_agg_general( output[key] = maybe_cast_result(result, obj, numeric_only=True) if len(output) == 0: - return self._python_apply_general(f, self._selected_obj) + return self._python_apply_general(f, self._selected_obj, is_empty_agg=True) if self.grouper._filter_empty_groups: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index dbd713a0af4cf..db13e2432227c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -125,7 +125,7 @@ def test_groupby_aggregation_multi_level_column(): def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA - grouped = ts.groupby(ts * np.nan) + grouped = ts.groupby(ts * np.nan, group_keys=False) assert ts.dtype == np.float64 # groupby float64 values results in Float64Index @@ -135,7 +135,7 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame - grouped = tsframe.groupby(tsframe["A"] * np.nan) + grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False) exp_df = DataFrame( columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64) ) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2d31c059122ab..08db639d45f23 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -598,7 +598,9 @@ def test_as_index_select_column(): expected = pd.Series([2, 4], name="B") tm.assert_series_equal(result, expected) - result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( + lambda x: x.cumsum() + ) expected = pd.Series( [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) ) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 19c0c2cc95491..89585ac69aa3c 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -631,7 +631,7 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. 
df = pd.DataFrame({1: [], 2: []}) - g = df.groupby(1) + g = df.groupby(1, group_keys=False) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) From 7fd1a07f2cc5b90c1f9b467f2b4f3b482824d251 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 08:26:31 -0500 Subject: [PATCH 17/67] Add resample tests --- pandas/tests/resample/test_resample_api.py | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 425cb34495655..9a4ab7bd1f1ad 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -91,6 +91,32 @@ def test_groupby_resample_on_api(): tm.assert_frame_equal(result, expected) +def test_resample_group_keys(): + df = pd.DataFrame({"A": 1, "B": 2}, index=pd.date_range("2000", periods=10)) + g = df.resample("5D") + expected = df.copy() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # stacklevel is set for groupby, not resample + result = g.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + + # no warning + g = df.resample("5D", group_keys=False) + with tm.assert_produces_warning(None): + result = g.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + + # no warning, group keys + expected.index = pd.MultiIndex.from_arrays( + [pd.to_datetime(["2000-01-01", "2000-01-06"]).repeat(5), expected.index] + ) + + g = df.resample("5D", group_keys=True) + with tm.assert_produces_warning(None): + result = g.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + + def test_pipe(test_frame): # GH17905 From 76e687369c6985f9f6fb06a0ad173ebce645cb20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 08:34:04 -0500 Subject: [PATCH 18/67] extension --- pandas/tests/extension/base/groupby.py | 8 ++++---- pandas/tests/extension/test_boolean.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 94d0ef7bbea84..5faa2edd0b84a 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -53,10 +53,10 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - df.groupby("B").apply(groupby_apply_op) - df.groupby("B").A.apply(groupby_apply_op) - df.groupby("A").apply(groupby_apply_op) - df.groupby("A").B.apply(groupby_apply_op) + df.groupby("B", group_keys=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False).A.apply(groupby_apply_op) + df.groupby("A", group_keys=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 725067951eeef..7a7ffa3861b38 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -290,10 +290,10 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - df.groupby("B").apply(groupby_apply_op) - df.groupby("B").A.apply(groupby_apply_op) - 
df.groupby("A").apply(groupby_apply_op) - df.groupby("A").B.apply(groupby_apply_op) + df.groupby("B", group_keys=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False).A.apply(groupby_apply_op) + df.groupby("A", group_keys=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) From 7f5cd0d1fd102dcdaa89232b0b6f681d168543a1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 10:42:37 -0500 Subject: [PATCH 19/67] fixups --- pandas/tests/resample/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 28d33ebb23c20..c472ac97523ef 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -131,7 +131,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti # count retains dimensions too - result = getattr(df.resample(freq), resample_method)() + result = getattr(df.resample(freq, group_keys=False), resample_method)() if resample_method != "size": expected = df.copy() else: @@ -187,7 +187,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # them to ensure they no longer do. (GH #10228) empty_series_dti = Series([], index, dtype) try: - getattr(empty_series_dti.resample("d"), resample_method)() + getattr(empty_series_dti.resample("d", group_keys=False), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) From d21dfb843b32a6d1cf4b13b71c4c65b5f136a126 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 10:44:04 -0500 Subject: [PATCH 20/67] lint --- pandas/core/groupby/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3a50d684105d1..fc8bb6dd65907 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -920,7 +920,8 @@ def _python_apply_general( ) if not (is_transform or is_empty_agg): warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - # XXX: mutating a stateful object. Consider passing a var through wrap_applied_output + # XXX: mutating a stateful object. Consider passing a var through + # wrap_applied_output self.group_keys = False return self._wrap_applied_output( From 6e253c362086b8802959c85687fe9e3b20a5866b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 11:36:34 -0500 Subject: [PATCH 21/67] fix doc warning --- doc/source/whatsnew/v0.25.0.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3cd920158f774..63fbfea2069ef 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -342,10 +342,15 @@ Now every group is evaluated only a single time. *New behavior*: -.. ipython:: python - - df.groupby("a").apply(func) +.. 
code-block:: python + In [3]: df.groupby('a').apply(func) + x + y + Out[3]: + a b + 0 x 1 + 1 y 2 Concatenating sparse values ^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 8efe63235e264ceeb19fa9f09616dbfeace1e776 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 11:38:43 -0500 Subject: [PATCH 22/67] lint --- pandas/_libs/reduction.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index ea28c25f06de5..7fa2e2e3e00fe 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -381,7 +381,6 @@ def apply_frame_axis0(object frame, object f, object names, except AttributeError: mutated = True - if not is_scalar(piece): # Need to copy data to avoid appending references try: From bda914e04719900a14adc5ca1301af2f1b415306 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 1 Jul 2020 11:53:20 -0500 Subject: [PATCH 23/67] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 40 +++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 59341f092feed..e1c272d3c99bc 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -779,6 +779,45 @@ Development Changes Deprecations ~~~~~~~~~~~~ +:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for ``apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`~DataFrame.groupby` no longer ignores the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`) +Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. + +.. code-block:: python + + >>> # pandas 1.0.4 + >>> df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 1 1 + 1 2 2 + 2 2 3 + >>> df.groupby("A").apply(lambda x: x.rename(np.exp)) + A B + A + 1 1.000000 1 1 + 2 2.718282 2 2 + 7.389056 2 3 + + >>> df.groupby("A").apply(lambda x: x) + A B + 0 1 1 + 1 2 2 + 2 2 3 + +In this future this behavior will change + +.. ipython:: python + :okwarning: + + df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) + df.groupby("A").apply(lambda x: x) + +Other Deprecations +^^^^^^^^^^^^^^^^^^ + - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) @@ -786,7 +825,6 @@ Deprecations - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) -- :meth:`~DataFrame.groupby` no longer ignores the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`). - Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, :issue:`34741`). 
- :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) From 80c789ee40e283b1487017ba8f48fce11f60e399 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jul 2020 15:07:30 -0500 Subject: [PATCH 24/67] ignore for now --- pandas/tests/groupby/test_apply.py | 12 ++++++++++++ pandas/tests/resample/test_period_index.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e944b7c96320f..fa88332f56c8e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1044,3 +1044,15 @@ def test_groupby_apply_group_keys_warns(): result = df["B"].groupby(df["A"]).apply(lambda x: x) tm.assert_series_equal(result, df["B"]) + + +@pytest.mark.xfail +def test_resample_with_only_nat(self): + # BinGrouper and Grouper aren't consistent with NA key handling. + # Causes a false positive here. + # https://github.com/pandas-dev/pandas/pull/34998#issuecomment-652497050 + pi = pd.PeriodIndex([pd.NaT] * 3, freq="S") + frame = DataFrame([2, 3, 5], index=pi) + + with tm.assert_produces_warning(None): + frame.resample("1s").mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index fe02eaef8ba82..f739a15367bb9 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -790,7 +790,7 @@ def test_resample_with_only_nat(self): frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index) - result = frame.resample("1s").mean() + result = frame.resample("1s", group_keys=False).mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From c4f6e2d66b4f1db79fe4908b8cc3fdddb98b80ba Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jul 2020 15:16:35 -0500 Subject: [PATCH 25/67] avoid mutating --- pandas/core/groupby/generic.py | 49 +++++++++++++++++++++++++++++----- pandas/core/groupby/groupby.py | 28 ++++++++++++++----- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 477d0fd859a22..16f0faa59b575 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -413,7 +413,13 @@ def _wrap_transformed_output( assert isinstance(result, Series) return result - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output( + self, + keys, + values, + not_indexed_same: bool = False, + override_group_keys: bool = False, + ): if len(keys) == 0: # GH #6265 return self.obj._constructor( @@ -440,10 +446,20 @@ def _get_index() -> Index: return result if isinstance(values[0], Series): - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + return self._concat_objects( + keys, + values, + not_indexed_same=not_indexed_same, + override_group_keys=override_group_keys, + ) elif isinstance(values[0], DataFrame): # possible that Series -> DataFrame by applied function - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + return self._concat_objects( + keys, + values, + not_indexed_same=not_indexed_same, + override_group_keys=override_group_keys, + ) else: # GH #6265 #24880 result = self.obj._constructor( @@ -1203,7 +1219,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, 
columns=result_columns) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output( + self, + keys, + values, + not_indexed_same: bool = False, + override_group_keys: bool = False, + ): if len(keys) == 0: return self.obj._constructor(index=keys) @@ -1217,7 +1239,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # We'd prefer it return an empty dataframe. return self.obj._constructor() elif isinstance(first_not_none, DataFrame): - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + return self._concat_objects( + keys, + values, + not_indexed_same=not_indexed_same, + override_group_keys=override_group_keys, + ) else: if len(self.grouper.groupings) > 1: key_index = self.grouper.result_index @@ -1284,7 +1311,10 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # OR we don't have a multi-index and we have only a # single values return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same + keys, + values, + not_indexed_same=not_indexed_same, + override_group_keys=override_group_keys, ) # still a series @@ -1296,7 +1326,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if not all_indexed_same: # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) + return self._concat_objects( + keys, + values, + not_indexed_same=True, + override_group_keys=override_group_keys, + ) if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fc8bb6dd65907..5cb9db2ca5820 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -900,6 +900,7 @@ def _python_apply_general( """ keys, values, mutated = self.grouper.apply(f, data, self.axis) not_indexed_same = mutated or self.mutated + override_group_keys = False if not not_indexed_same and self.group_keys is lib.no_default: if self.ndim == 1: @@ -920,12 +921,13 @@ def _python_apply_general( ) if not (is_transform or is_empty_agg): warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - # XXX: mutating a stateful object. 
Consider passing a var through - # wrap_applied_output - self.group_keys = False + override_group_keys = True return self._wrap_applied_output( - keys, values, not_indexed_same=not_indexed_same + keys, + values, + not_indexed_same=not_indexed_same, + override_group_keys=override_group_keys, ) def _iterate_slices(self) -> Iterable[Series]: @@ -1012,7 +1014,13 @@ def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output( + self, + keys, + values, + not_indexed_same: bool = False, + override_group_keys: bool = False, + ): raise AbstractMethodError(self) def _agg_general( @@ -1136,7 +1144,13 @@ def _python_agg_general( return self._wrap_aggregated_output(output) - def _concat_objects(self, keys, values, not_indexed_same: bool = False): + def _concat_objects( + self, + keys, + values, + not_indexed_same: bool = False, + override_group_keys: bool = False, + ): from pandas.core.reshape.concat import concat def reset_identity(values): @@ -1147,7 +1161,7 @@ def reset_identity(values): ax._reset_identity() return values - if self.group_keys: + if self.group_keys and not override_group_keys: values = reset_identity(values) if self.as_index: From d123a80352c733664bf436dea008148d2a3fbb0a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Jul 2020 15:26:15 -0500 Subject: [PATCH 26/67] comment on override_group_keys --- pandas/core/groupby/groupby.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5cb9db2ca5820..34c9198720b6e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -902,7 +902,9 @@ def _python_apply_general( not_indexed_same = mutated or self.mutated override_group_keys = False - if not not_indexed_same and self.group_keys is lib.no_default: + if (not not_indexed_same and self.group_keys is lib.no_default) and not ( + is_transform or is_empty_agg + ): if self.ndim == 1: stacklevel = 4 elif self._selection is None: @@ -919,8 +921,12 @@ def _python_apply_general( "To adopt the future behavior and silence this warning, use " "\n\n\t>>> .groupby(..., group_keys=True)" ) - if not (is_transform or is_empty_agg): - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + # We want to behave as if `self.group_keys=False` when reconstructing + # the object. However, we don't want to mutate the stateful GroupBy + # object, so we just override it. + # When this deprecation is enforced then override_group_keys + # may be removed. override_group_keys = True return self._wrap_applied_output( From 9cb58a39148b8c1ed348b52cb949650df4c6a694 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jul 2020 06:19:49 -0500 Subject: [PATCH 27/67] fixups --- doc/source/user_guide/groupby.rst | 2 +- pandas/tests/generic/test_finalize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ddba3dc452e28..efea77f1c9713 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1019,7 +1019,7 @@ The dimension of the returned result can also change: .. 
ipython:: - In [8]: grouped = df.groupby('A')['C'] + In [8]: grouped = df.groupby('A', group_keys=False)['C'] In [10]: def f(group): ....: return pd.DataFrame({'original': group, diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4d0f1a326225d..2a7412258bb3c 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -786,5 +786,5 @@ def test_categorical_accessor(method): @not_implemented_mark def test_groupby(obj, method): obj.attrs = {"a": 1} - result = method(obj.groupby([0, 0])) + result = method(obj.groupby([0, 0], group_keys=False)) assert result.attrs == {"a": 1} From 2e59629ccb1de3fc3a70a884a74547aaa3624da1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 08:08:19 -0500 Subject: [PATCH 28/67] typing --- pandas/core/groupby/generic.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 16f0faa59b575..a9844c14c2ad4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -462,7 +462,9 @@ def _get_index() -> Index: ) else: # GH #6265 #24880 - result = self.obj._constructor( + # ignore Incompatible types in assignment (expression has type + # "Series", variable has type "DataFrame") + result = self.obj._constructor( # type: ignore data=values, index=_get_index(), name=self._selection_name ) return self._reindex_output(result) @@ -1280,11 +1282,16 @@ def _wrap_applied_output( # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( + # ignoring "create_series_with_explicit_dtype" gets + # multiple values for keyword argument "dtype_if_empty" + backup = create_series_with_explicit_dtype( # type: ignore **kwargs, dtype_if_empty=object ) else: - backup = first_not_none._constructor(**kwargs) + # ignore error: Incompatible types in assignment ( + # expression has type "NDFrame", variable has type + # "Series") + backup = first_not_none._constructor(**kwargs) # type: ignore values = [x if (x is not None) else backup for x in values] From bfb854e0d73ffa52d3c2adf2be6c4ccbc227e3d0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 10:31:27 -0500 Subject: [PATCH 29/67] fixups --- doc/source/whatsnew/v1.1.0.rst | 12 ++++++------ pandas/core/generic.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1dcdf33bebad1..9c5b4017abd62 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -780,10 +780,10 @@ Development Changes Deprecations ~~~~~~~~~~~~ -:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for ``apply`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`~DataFrame.groupby` no longer ignores the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`) +:meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`). 
Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. .. code-block:: python @@ -795,20 +795,20 @@ Previous versions of pandas would add the group keys only when the result from t 0 1 1 1 2 2 2 2 3 - >>> df.groupby("A").apply(lambda x: x.rename(np.exp)) + >>> df.groupby("A").apply(lambda x: x.rename(np.exp)) # Different index A B A 1 1.000000 1 1 2 2.718282 2 2 7.389056 2 3 - >>> df.groupby("A").apply(lambda x: x) + >>> df.groupby("A").apply(lambda x: x) # Same index A B 0 1 1 1 2 2 2 2 3 -In this future this behavior will change +In this future this behavior will change to always respect ``as_index``, which defaults to True. .. ipython:: python :okwarning: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 003253b5d4043..16fb59c127e6d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7764,7 +7764,7 @@ def resample( group_keys : bool, default True Whether to include the group keys in the result index when performing - a ``.groupby().apply()`` to to the resampled object. + a ``.groupby().apply()`` to the resampled object. .. versionadded:: 1.1.0 From 7469b29c5cdaf54d72ecf900e411e48d014c3fd6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 14:37:46 -0500 Subject: [PATCH 30/67] mypy --- pandas/core/groupby/generic.py | 19 ++++++++----------- pandas/core/groupby/groupby.py | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a9844c14c2ad4..ce9eb1c0ec6bb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -30,7 +30,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -419,7 +419,8 @@ def _wrap_applied_output( values, not_indexed_same: bool = False, override_group_keys: bool = False, - ): + ) -> FrameOrSeriesUnion: + result: FrameOrSeriesUnion if len(keys) == 0: # GH #6265 return self.obj._constructor( @@ -462,9 +463,7 @@ def _get_index() -> Index: ) else: # GH #6265 #24880 - # ignore Incompatible types in assignment (expression has type - # "Series", variable has type "DataFrame") - result = self.obj._constructor( # type: ignore + result = self.obj._constructor( data=values, index=_get_index(), name=self._selection_name ) return self._reindex_output(result) @@ -1227,7 +1226,7 @@ def _wrap_applied_output( values, not_indexed_same: bool = False, override_group_keys: bool = False, - ): + ) -> FrameOrSeriesUnion: if len(keys) == 0: return self.obj._constructor(index=keys) @@ -1276,11 +1275,12 @@ def _wrap_applied_output( # make Nones an empty object if first_not_none is None: return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + elif isinstance(first_not_none, (Series, DataFrame)): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() + backup: FrameOrSeriesUnion if isinstance(first_not_none, Series): # ignoring "create_series_with_explicit_dtype" gets # multiple values for keyword argument "dtype_if_empty" @@ -1288,10 +1288,7 @@ def _wrap_applied_output( **kwargs, dtype_if_empty=object ) else: - # ignore error: Incompatible types in assignment ( - # expression has type "NDFrame", variable has type - # "Series") - backup = 
first_not_none._constructor(**kwargs) # type: ignore + backup = first_not_none._constructor(**kwargs) values = [x if (x is not None) else backup for x in values] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 34c9198720b6e..b8c881182b8c0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -762,7 +762,7 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) - is_transform = name in {"diff", "fillna", "tshift"} + is_transform = name in base.transformation_kernels try: return self._python_apply_general( curried, self._obj_with_exclusions, is_transform=is_transform From 16e0f5ff3f00f4df46cc47d051f81cbb3de0f63d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 14:39:36 -0500 Subject: [PATCH 31/67] mypy --- pandas/core/groupby/generic.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ce9eb1c0ec6bb..cb0d5babbea21 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1282,10 +1282,9 @@ def _wrap_applied_output( kwargs = first_not_none._construct_axes_dict() backup: FrameOrSeriesUnion if isinstance(first_not_none, Series): - # ignoring "create_series_with_explicit_dtype" gets - # multiple values for keyword argument "dtype_if_empty" - backup = create_series_with_explicit_dtype( # type: ignore - **kwargs, dtype_if_empty=object + kwargs["dtype_if_empty"] = object + backup = create_series_with_explicit_dtype( + **kwargs, ) else: backup = first_not_none._constructor(**kwargs) From 4173a3237135ffb45c8a8cf2c1659d4c9084eaf6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 15:20:46 -0500 Subject: [PATCH 32/67] lint --- pandas/core/groupby/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cb0d5babbea21..66f0824248bf2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1283,9 +1283,7 @@ def _wrap_applied_output( backup: FrameOrSeriesUnion if isinstance(first_not_none, Series): kwargs["dtype_if_empty"] = object - backup = create_series_with_explicit_dtype( - **kwargs, - ) + backup = create_series_with_explicit_dtype(**kwargs) else: backup = first_not_none._constructor(**kwargs) From b6af0dab91b48ae92a8b31ddbd61e2289cf5ff5a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 12 Jul 2020 15:41:17 -0500 Subject: [PATCH 33/67] fixup --- pandas/core/groupby/ops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 42f9726f395da..7444aed0f254b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -904,8 +904,10 @@ def _is_indexed_like(obj, axes) -> bool: is_frame = isinstance(obj, DataFrame) if is_frame and len(axes) == 1: + # The UDF was DataFrame -> Series return True elif is_frame: + # The UDF was DataFrame -> DataFrame return obj.index.equals(axes[0]) and obj.columns.equals(axes[1]) return False From 8e46a6e6f9ceaf9ddb6d76f2d5641737bec16745 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 12 Jul 2020 15:44:41 -0500 Subject: [PATCH 34/67] change ref --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index bf543d950d321..15ec2dbd216ff 100644 --- a/pandas/tests/groupby/test_apply.py +++ 
b/pandas/tests/groupby/test_apply.py @@ -1048,9 +1048,9 @@ def test_groupby_apply_group_keys_warns(): @pytest.mark.xfail def test_resample_with_only_nat(self): + # https://github.com/pandas-dev/pandas/issues/35251 # BinGrouper and Grouper aren't consistent with NA key handling. # Causes a false positive here. - # https://github.com/pandas-dev/pandas/pull/34998#issuecomment-652497050 pi = pd.PeriodIndex([pd.NaT] * 3, freq="S") frame = DataFrame([2, 3, 5], index=pi) From d745c4a005e0f830c996a421eb1d1daf2587a796 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 12 Jul 2020 15:52:03 -0500 Subject: [PATCH 35/67] fixup whatsnew --- doc/source/whatsnew/v1.1.0.rst | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fb0b66bab3360..25e8ddad3eb03 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -790,6 +790,8 @@ Deprecations :meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`). Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. +*Previous behavior*: + .. code-block:: python >>> # pandas 1.0.4 @@ -812,12 +814,21 @@ Previous versions of pandas would add the group keys only when the result from t 1 2 2 2 2 3 -In this future this behavior will change to always respect ``as_index``, which defaults to True. +In this future this behavior will change to always respect ``group_keys``, which defaults to True. + +*New behavior*: .. ipython:: python - :okwarning: df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) + df.groupby("A").apply(lambda x: x, group_keys=True) + df.groupby("A").apply(lambda x: x.rename(np.exp), group_keys=True) + +A warning will be issued if the result would change from pandas 1.0.4 + +.. ipython:: python + :okwarning: + df.groupby("A").apply(lambda x: x) Other Deprecations From 23caf8ed4988e9dab208e8517dc920f0c7b673f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 12 Jul 2020 15:52:11 -0500 Subject: [PATCH 36/67] fixup whatsnew --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25e8ddad3eb03..d1be7a8eb3781 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -821,8 +821,8 @@ In this future this behavior will change to always respect ``group_keys``, which .. 
ipython:: python df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) - df.groupby("A").apply(lambda x: x, group_keys=True) - df.groupby("A").apply(lambda x: x.rename(np.exp), group_keys=True) + df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True).apply(lambda x: x.rename(np.exp)) A warning will be issued if the result would change from pandas 1.0.4 From 3d4a74484552103badf887e4e860e01ed0d739b6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jul 2020 10:35:47 -0500 Subject: [PATCH 37/67] doc --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9932b24f24d73..6a52be5bd800c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6450,7 +6450,7 @@ def update( b 12.3 123.0 NaN 12.3 33.0 -To exclude or include the group keys in the index, specify ``group_keys`` +When using ``.apply()``, use ``group_keys`` to include or exclude the group keys. >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6f76c92c77d24..c09df60a0a99e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7387,8 +7387,18 @@ def clip( Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. - group_keys : bool, default True + group_keys : bool, optional When calling apply, add group keys to index to identify pieces. + By default group keys are not included when the result's index + (and column) labels match the inputs, and are included otherwise. + + .. versionchanged:: 1.1.0 + + Warns that `group_keys` will no longer be ignored when the + result from ``apply`` is a like-indexed Series or DataFrame. + Specify ``group_keys`` explicitly to include the group keys or + not. + squeeze : bool, default False Reduce the dimensionality of the return type if possible, otherwise return a consistent type. From 20d8520acb586ec237203cc9d30d785d77bea0e2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Aug 2020 14:26:11 -0500 Subject: [PATCH 38/67] remove xpass --- doc/source/user_guide/groupby.rst | 14 ++++++-------- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/groupby/groupby.py | 2 ++ pandas/tests/groupby/test_apply.py | 7 ++----- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index efea77f1c9713..a8276bd865f28 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1017,16 +1017,14 @@ for both ``aggregate`` and ``transform`` in many standard use cases. However, The dimension of the returned result can also change: -.. ipython:: - - In [8]: grouped = df.groupby('A', group_keys=False)['C'] +.. 
ipython:: python - In [10]: def f(group): - ....: return pd.DataFrame({'original': group, - ....: 'demeaned': group - group.mean()}) - ....: + grouped = df.groupby('A', group_keys=False)['C'] - In [11]: grouped.apply(f) + def f(group): + return pd.DataFrame({'original': group, + 'demeaned': group - group.mean()}) + grouped.apply(f) ``apply`` on a Series can operate on a returned value from the applied function, that is itself a series, and possibly upcast the result to a DataFrame: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3c734cce63073..3d105dde5f55a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -32,7 +32,7 @@ Deprecations :meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`). +:meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`, :issue:`31612`, :issue:`14927`, :issue:`13056`). Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. *Previous behavior*: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba7aff33a7ac7..cd5ee78045140 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7400,7 +7400,7 @@ def clip( By default group keys are not included when the result's index (and column) labels match the inputs, and are included otherwise. - .. versionchanged:: 1.1.0 + .. versionchanged:: 1.2.0 Warns that `group_keys` will no longer be ignored when the result from ``apply`` is a like-indexed Series or DataFrame. @@ -7783,7 +7783,7 @@ def resample( Whether to include the group keys in the result index when performing a ``.groupby().apply()`` to the resampled object. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7ddb63e007261..041d72c19ce95 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -910,6 +910,8 @@ def _python_apply_general( if (not not_indexed_same and self.group_keys is lib.no_default) and not ( is_transform or is_empty_agg ): + # We've detected value-dependent behavior: the result's index depends on + # whether the user's function `f` returned the same index or not. 
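The "value-dependent behavior" named in the comment above is easiest to see from the caller's side: the same ``groupby(...).apply(...)`` call produces a differently shaped index depending only on what the UDF returns. A hedged sketch with toy data (not taken from the test suite):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]})

    # Transform-like UDF: the result keeps the original row labels, so no
    # group keys are prepended (and, with this change, a FutureWarning is
    # raised when group_keys was not passed explicitly).
    same = df.groupby("A").apply(lambda x: x)

    # UDF that relabels the rows: the result is treated as "not indexed the
    # same", so "A" is prepended as the outer index level.
    relabelled = df.groupby("A").apply(lambda x: x.rename(np.exp))

    print(same.index)        # a plain RangeIndex
    print(relabelled.index)  # a MultiIndex whose first level is "A"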
if self.ndim == 1: stacklevel = 4 elif self._selection is None: diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 49246fb319661..52cd04e838d48 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -215,7 +215,6 @@ def test_group_apply_once_per_group2(capsys): assert result == expected -@pytest.mark.xfail(reason="GH-34998") def test_apply_fast_slow_identical(): # GH 31613 @@ -239,11 +238,9 @@ def fast(group): "func", [ lambda x: x, - pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")), + lambda x: x[:], lambda x: x.copy(deep=False), - pytest.param( - lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998") - ), + lambda x: x.copy(deep=True), ], ) def test_groupby_apply_identity_maybecopy_index_identical(func): From 7cf051a5c40c4756737e9341c23e1ff97faa21e5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Aug 2020 08:40:04 -0500 Subject: [PATCH 39/67] Fixup --- pandas/tests/groupby/test_function.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 557cd05f450b6..2ebf08b9413b7 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -952,6 +952,11 @@ def test_frame_describe_multikey(tsframe): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T + # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) tm.assert_frame_equal(result, expected) From b3c8d53d4183f06a299d802100a8e221abc087ea Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 2 Jan 2021 17:15:17 -0500 Subject: [PATCH 40/67] merge cleanup and pass tests --- pandas/core/frame.py | 2 + pandas/core/groupby/generic.py | 8 +-- pandas/core/groupby/groupby.py | 57 ++++++------------- pandas/core/series.py | 2 + .../tests/frame/apply/test_frame_transform.py | 4 +- pandas/tests/groupby/test_function.py | 6 +- .../tests/groupby/transform/test_transform.py | 10 +++- .../series/apply/test_series_transform.py | 2 +- pandas/tests/window/test_groupby.py | 9 +-- 9 files changed, 46 insertions(+), 54 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5870131e9ec02..53c768348121c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9391,6 +9391,7 @@ def resample( level=None, origin: Union[str, "TimestampConvertibleTypes"] = "start_day", offset: Optional["TimedeltaConvertibleTypes"] = None, + group_keys: bool = no_default, ) -> "Resampler": return super().resample( rule=rule, @@ -9405,6 +9406,7 @@ def resample( level=level, origin=origin, offset=offset, + group_keys=group_keys, ) def to_timestamp( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2dc0d7bb62ed1..6bda5315d5083 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1612,13 +1612,13 @@ def _gotitem(self, key, ndim: int, subset=None): subset = self.obj[key] return SeriesGroupBy( subset, - selection=key, - grouper=self.grouper, - observed=self.observed, - group_keys=self.group_keys, level=self.level, + grouper=self.grouper, + selection=key, sort=self.sort, + group_keys=self.group_keys, squeeze=self.squeeze, + observed=self.observed, dropna=self.dropna, ) diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index 6fd49b2f7a964..d4244d619a846 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,7 +11,6 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import inspect -import re from textwrap import dedent import types from typing import ( @@ -830,27 +829,10 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) - # TODO: master has this as just a call to python_apply_general is_transform = name in base.transformation_kernels - try: - return self._python_apply_general( - curried, self._obj_with_exclusions, is_transform=is_transform - ) - except TypeError as err: - if not re.search( - "reduction operation '.*' not allowed for this dtype", str(err) - ): - # We don't have a cython implementation - # TODO: is the above comment accurate? - raise - - if self.obj.ndim == 1: - # this can be called recursively, so need to raise ValueError - raise ValueError - - # GH#3688 try to operate item-by-item - result = self._aggregate_item_by_item(name, *args, **kwargs) - return result + return self._python_apply_general( + curried, self._obj_with_exclusions, is_transform=is_transform + ) wrapper.__name__ = name return wrapper @@ -1266,28 +1248,23 @@ def _python_agg_general(self, func, *args, **kwargs): assert result is not None key = base.OutputKey(label=name, position=idx) - # TODO: commented is from PR - # if len(output) == 0: - # return self._python_apply_general( - # f, self._selected_obj, is_empty_agg=True - # ) - if is_numeric_dtype(obj.dtype): - result = maybe_downcast_numeric(result, obj.dtype) + if is_numeric_dtype(obj.dtype): + result = maybe_downcast_numeric(result, obj.dtype) - if self.grouper._filter_empty_groups: - mask = counts.ravel() > 0 + if self.grouper._filter_empty_groups: + mask = counts.ravel() > 0 - # since we are masking, make sure that we have a float object - values = result - if is_numeric_dtype(values.dtype): - values = ensure_float(values) + # since we are masking, make sure that we have a float object + values = result + if is_numeric_dtype(values.dtype): + values = ensure_float(values) - result = maybe_downcast_numeric(values[mask], result.dtype) + result = maybe_downcast_numeric(values[mask], result.dtype) - output[key] = result + output[key] = result if not output: - return self._python_apply_general(f, self._selected_obj) + return self._python_apply_general(f, self._selected_obj, is_empty_agg=True) return self._wrap_aggregated_output(output, index=self.grouper.result_index) @@ -1336,19 +1313,19 @@ def reset_identity(values): elif not not_indexed_same: result = concat(values, axis=self.axis) - ax = self._selected_obj._get_axis(self.axis) + ax = self.filter(lambda x: True).axes[self.axis] # this is a very unfortunate situation # we can't use reindex to restore the original order # when the ax has duplicates # so we resort to this # GH 14776, 30667 - if ax.has_duplicates: + if ax.has_duplicates and not result.axes[self.axis].equals(ax): indexer, _ = result.index.get_indexer_non_unique(ax.values) indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis, copy=False) else: values = reset_identity(values) diff --git a/pandas/core/series.py b/pandas/core/series.py index fa356f79b38bb..4eadf55ba4785 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4884,6 +4884,7 @@ def resample( level=None, origin: 
Union[str, "TimestampConvertibleTypes"] = "start_day", offset: Optional["TimedeltaConvertibleTypes"] = None, + group_keys: bool = no_default, ) -> "Resampler": return super().resample( rule=rule, @@ -4898,6 +4899,7 @@ def resample( level=level, origin=origin, offset=offset, + group_keys=group_keys, ) def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index db5b2f3d86dfe..e6a552b5097f4 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -51,7 +51,9 @@ def test_transform_groupby_kernel(axis, float_frame, op): ones = np.ones(float_frame.shape[0]) else: ones = np.ones(float_frame.shape[1]) - expected = float_frame.groupby(ones, axis=axis).transform(op, *args) + expected = float_frame.groupby(ones, axis=axis, group_keys=False).transform( + op, *args + ) result = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index bfc012628b32c..fec4806056578 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -543,7 +543,7 @@ def test_idxmin_idxmax_axis1(): df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - gb = df.groupby("A") + gb = df.groupby("A", group_keys=True) res = gb.idxmax(axis=1) @@ -965,7 +965,9 @@ def test_frame_describe_multikey(tsframe): expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) - groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + groupedT = tsframe.groupby( + {"A": 0, "B": 0, "C": 1, "D": 1}, axis=1, group_keys=True + ) result = groupedT.describe() expected = tsframe.describe().T # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index dd1bdc06755ff..a90125494c9ba 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -175,8 +175,14 @@ def test_transform_axis_1(transformation_func): args = ("ffill",) if transformation_func == "fillna" else () df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args) - expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T + result = df.groupby([0, 0, 1], axis=1, group_keys=False).transform( + transformation_func, *args + ) + expected = ( + df.T.groupby([0, 0, 1], group_keys=False) + .transform(transformation_func, *args) + .T + ) if transformation_func == "diff": # Result contains nans, so transpose coerces to float diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 992aaa540a65f..702069d7ef721 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -19,7 +19,7 @@ def test_transform_groupby_kernel(string_series, op): args = [0.0] if op == "fillna" else [] ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones).transform(op, *args) + expected = string_series.groupby(ones, group_keys=False).transform(op, *args) result = string_series.transform(op, 0, *args) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index f915da3330ba7..37b3501397e08 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -757,7 +757,7 @@ class TestEWM: def test_methods(self, method, expected_data): # GH 16037 df = DataFrame({"A": ["a"] * 4, "B": range(4)}) - result = getattr(df.groupby("A").ewm(com=1.0), method)() + result = getattr(df.groupby("A", group_keys=True).ewm(com=1.0), method)() expected = DataFrame( {"B": expected_data}, index=MultiIndex.from_tuples( @@ -772,9 +772,10 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - expected = df.groupby("A").apply(lambda x: getattr(x.ewm(com=1.0), method)()) - # There may be a bug in the above statement; not returning the correct index - tm.assert_frame_equal(result.reset_index(drop=True), expected) + expected = df.groupby("A", group_keys=True).apply( + lambda x: getattr(x.ewm(com=1.0), method)() + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "method, expected_data", From 08ae03fb080838e6a9333e35fd27501f1b214bbe Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 3 Jan 2021 09:39:58 -0500 Subject: [PATCH 41/67] Move docs to 1.3, avoid warnings in tests --- doc/source/whatsnew/v1.2.0.rst | 53 ------------------- doc/source/whatsnew/v1.3.0.rst | 51 ++++++++++++++++++ pandas/core/generic.py | 83 +----------------------------- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/ops.py | 1 - pandas/core/shared_docs.py | 16 +++++- pandas/tests/groupby/test_apply.py | 2 +- 7 files changed, 70 insertions(+), 140 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e8afd25b4ae6f..8e9361125513b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -512,59 +512,6 @@ Other API changes Deprecations ~~~~~~~~~~~~ - -:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`, :issue:`31612`, :issue:`14927`, :issue:`13056`). -Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. - -*Previous behavior*: - -.. code-block:: python - - >>> # pandas 1.0.4 - >>> df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) - >>> df - A B - 0 1 1 - 1 2 2 - 2 2 3 - >>> df.groupby("A").apply(lambda x: x.rename(np.exp)) # Different index - A B - A - 1 1.000000 1 1 - 2 2.718282 2 2 - 7.389056 2 3 - - >>> df.groupby("A").apply(lambda x: x) # Same index - A B - 0 1 1 - 1 2 2 - 2 2 3 - -In this future this behavior will change to always respect ``group_keys``, which defaults to True. - -*New behavior*: - -.. ipython:: python - - df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) - df.groupby("A", group_keys=True).apply(lambda x: x) - df.groupby("A", group_keys=True).apply(lambda x: x.rename(np.exp)) - -A warning will be issued if the result would change from pandas 1.0.4 - -.. 
ipython:: python - :okwarning: - - df.groupby("A").apply(lambda x: x) - - -Other Deprecations -^^^^^^^^^^^^^^^^^^ - -- - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5e84947cd42f1..3d5c8e69c4814 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -152,6 +152,57 @@ Other API changes Deprecations ~~~~~~~~~~~~ + +:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`, :issue:`31612`, :issue:`14927`, :issue:`13056`). +Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. + +*Previous behavior*: + +.. code-block:: python + + >>> # pandas 1.2 + >>> df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) + >>> df + A B + 0 1 1 + 1 2 2 + 2 2 3 + >>> df.groupby("A").apply(lambda x: x.rename(np.exp)) # Different index + A B + A + 1 1.000000 1 1 + 2 2.718282 2 2 + 7.389056 2 3 + + >>> df.groupby("A").apply(lambda x: x) # Same index + A B + 0 1 1 + 1 2 2 + 2 2 3 + +In this future this behavior will change to always respect ``group_keys``, which defaults to True. + +*New behavior*: + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]}) + df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True).apply(lambda x: x.rename(np.exp)) + +A warning will be issued if the result would change from pandas 1.2 + +.. ipython:: python + :okwarning: + + df.groupby("A").apply(lambda x: x) + + +Other Deprecations +^^^^^^^^^^^^^^^^^^ - Deprecating allowing scalars passed to the :class:`Categorical` constructor (:issue:`38433`) - Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`,:issue:`21311`,:issue:`22315`,:issue:`26974`) - Deprecated ``astype`` of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fbcf5f352d5ae..cdc0318dfc22b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7451,87 +7451,6 @@ def clip( return result - _shared_docs[ - "groupby" - ] = """ - Group %(klass)s using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the - object, applying a function, and combining the results. This can be - used to group large amounts of data and compute operations on these - groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. - If ``by`` is a function, it's called on each value of the object's - index. 
If a dict or Series is passed, the Series or dict VALUES - will be used to determine the groups (the Series' values are first - aligned; see ``.align()`` method). If an ndarray is passed, the - values are used as-is determine the groups. A label or list of - labels may be passed to group by the columns in ``self``. Notice - that a tuple is interpreted as a (single) key. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as the - index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output. - sort : bool, default True - Sort group keys. Get better performance by turning this off. - Note this does not influence the order of observations within each - group. Groupby preserves the order of rows within each group. - group_keys : bool, optional - When calling apply, add group keys to index to identify pieces. - By default group keys are not included when the result's index - (and column) labels match the inputs, and are included otherwise. - - .. versionchanged:: 1.2.0 - - Warns that `group_keys` will no longer be ignored when the - result from ``apply`` is a like-indexed Series or DataFrame. - Specify ``group_keys`` explicitly to include the group keys or - not. - - squeeze : bool, default False - Reduce the dimensionality of the return type if possible, - otherwise return a consistent type. - - .. deprecated:: 1.1.0 - - observed : bool, default False - This only applies if any of the groupers are Categoricals. - If True: only show observed values for categorical groupers. - If False: show all values for categorical groupers. - - .. versionadded:: 0.23.0 - dropna : bool, default True - If True, and if group keys contain NA values, NA values together - with row/column will be dropped. - If False, NA values will also be treated as the key in groups - - .. versionadded:: 1.1.0 - - Returns - ------- - %(klass)sGroupBy - Returns a groupby object that contains information about the groups. - - See Also - -------- - resample : Convenience method for frequency conversion and resampling - of time series. - - Notes - ----- - See the `user guide - `_ for more. - """ - @doc(**_shared_doc_kwargs) def asfreq( self: FrameOrSeries, @@ -7896,7 +7815,7 @@ def resample( Whether to include the group keys in the result index when performing a ``.groupby().apply()`` to the resampled object. - .. versionadded:: 1.2.0 + .. 
versionadded:: 1.3.0 Returns ------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d4244d619a846..20fb723ce2f6a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2835,7 +2835,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 fill_method = "pad" limit = 0 filled = getattr(self, fill_method)(limit=limit) - fill_grp = filled.groupby(self.grouper.codes, axis=self.axis) + fill_grp = filled.groupby( + self.grouper.codes, axis=self.axis, group_keys=self.group_keys + ) shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis) return (filled / shifted) - 1 diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index af35d3a6a38c9..dd86b5c1a75b7 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -233,7 +233,6 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): # group might be modified group_axes = group.axes res = f(group) - # if not _is_indexed_like(res, group_axes, axis): if not _is_indexed_like(res, group_axes): mutated = True result_values.append(res) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index c1aed4eb3409b..5ba3bcbf4f443 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -107,8 +107,18 @@ Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. -group_keys : bool, default True - When calling ``groupby().apply()``, add group keys to index to identify pieces. +group_keys : bool, optional + When calling apply, add group keys to index to identify pieces. + By default group keys are not included when the result's index + (and column) labels match the inputs, and are included otherwise. + + .. versionchanged:: 1.3.0 + + Warns that `group_keys` will no longer be ignored when the + result from ``apply`` is a like-indexed Series or DataFrame. + Specify ``group_keys`` explicitly to include the group keys or + not. + squeeze : bool, default False Reduce the dimensionality of the return type if possible, otherwise return a consistent type. @@ -119,6 +129,8 @@ This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. + + .. versionadded:: 0.23.0 dropna : bool, default True If True, and if group keys contain NA values, NA values together with row/column will be dropped. 
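A minimal sketch of the ``group_keys`` semantics documented in the hunk above (illustrative only; it assumes a pandas build that includes these changes):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]})

# Transform-like UDF (returns a like-indexed object) with group_keys left
# unspecified: the group keys are not prepended, and a FutureWarning notes
# that group_keys will no longer be ignored in a future version.
df.groupby("A").apply(lambda x: x)

# Being explicit opts in or out of the extra index level and avoids the warning.
df.groupby("A", group_keys=True).apply(lambda x: x)   # "A" added as the outer index level
df.groupby("A", group_keys=False).apply(lambda x: x)  # original RangeIndex kept

# A UDF returning a differently-indexed object keeps the pre-existing default
# of prepending the group keys.
df.groupby("A").apply(lambda x: x.rename(lambda y: y + 1))
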
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index da8a8a04e2816..0fa33eb54c360 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1161,7 +1161,7 @@ def test_apply_dropna_with_indexed_same(): }, index=list("xxyxz"), ) - result = df.groupby("group").apply(lambda x: x) + result = df.groupby("group", group_keys=False).apply(lambda x: x) expected = DataFrame( { "col": [1, 4, 5], From f0551938438b29fefc953ab1a94d31a25f505116 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Sun, 3 Jan 2021 14:28:57 -0500 Subject: [PATCH 42/67] Revert group_keys=False when apply is not used --- pandas/core/groupby/groupby.py | 2 +- .../tests/frame/apply/test_frame_transform.py | 4 +-- pandas/tests/groupby/test_apply.py | 4 +-- pandas/tests/groupby/test_function.py | 6 ++-- pandas/tests/groupby/test_groupby.py | 8 ++---- pandas/tests/groupby/test_grouping.py | 2 +- .../tests/groupby/transform/test_transform.py | 28 ++++--------------- .../series/apply/test_series_transform.py | 2 +- 8 files changed, 17 insertions(+), 39 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 20fb723ce2f6a..8eb26888b75b0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -993,7 +993,7 @@ def _python_apply_general( keys, values, not_indexed_same=not_indexed_same, - override_group_keys=override_group_keys, + override_group_keys=is_transform or override_group_keys, ) def _iterate_slices(self) -> Iterable[Series]: diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index e6a552b5097f4..db5b2f3d86dfe 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -51,9 +51,7 @@ def test_transform_groupby_kernel(axis, float_frame, op): ones = np.ones(float_frame.shape[0]) else: ones = np.ones(float_frame.shape[1]) - expected = float_frame.groupby(ones, axis=axis, group_keys=False).transform( - op, *args - ) + expected = float_frame.groupby(ones, axis=axis).transform(op, *args) result = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0fa33eb54c360..cc54001eda453 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1045,11 +1045,9 @@ def test_groupby_apply_group_keys_warns(): tm.assert_series_equal(result, df["B"]) -@pytest.mark.xfail +@pytest.mark.xfail("BinGrouper and Grouper aren't consistent with NA key handling") def test_resample_with_only_nat(self): # https://github.com/pandas-dev/pandas/issues/35251 - # BinGrouper and Grouper aren't consistent with NA key handling. - # Causes a false positive here. 
pi = pd.PeriodIndex([pd.NaT] * 3, freq="S") frame = DataFrame([2, 3, 5], index=pi) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index fec4806056578..2b5f3c8dac747 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -403,10 +403,10 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) # GH 13994 - result = df.groupby("A", group_keys=False).cumsum(axis=1) + result = df.groupby("A").cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).cumprod(axis=1) + result = df.groupby("A").cumprod(axis=1) expected = df.cumprod(axis=1) tm.assert_frame_equal(result, expected) @@ -543,7 +543,7 @@ def test_idxmin_idxmax_axis1(): df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - gb = df.groupby("A", group_keys=True) + gb = df.groupby("A") res = gb.idxmax(axis=1) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 91ff57c0d5f74..62b579bf929ad 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -39,7 +39,7 @@ def test_basic(dtype): np.random.shuffle(index) data = data.reindex(index) - grouped = data.groupby(lambda x: x // 3, group_keys=False) + grouped = data.groupby(lambda x: x // 3) for k, v in grouped: assert len(v) == 3 @@ -608,9 +608,7 @@ def test_as_index_select_column(): expected = Series([2, 4], name="B") tm.assert_series_equal(result, expected) - result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( - lambda x: x.cumsum() - ) + result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) expected = Series( [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) ) @@ -1737,7 +1735,7 @@ def test_group_shift_with_fill_value(): columns=["A", "B", "Z"], index=None, ) - g = df.groupby(["A", "B"], group_keys=False) + g = df.groupby(["A", "B"]) expected = DataFrame( [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e5cd90c21f426..0849341f12f8b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -645,7 +645,7 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. 
df = DataFrame({1: [], 2: []}) - g = df.groupby(1, group_keys=False) + g = df.groupby(1) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a90125494c9ba..a68673fc83467 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -62,18 +62,8 @@ def demean(arr): index=["Joe", "Steve", "Wes", "Jim", "Travis"], ) key = ["one", "two", "one", "two", "one"] - result = ( - people.groupby(key, group_keys=False) - .transform(demean) - .groupby(key, group_keys=False) - .mean() - ) - expected = ( - people.groupby(key, group_keys=False) - .apply(demean) - .groupby(key, group_keys=False) - .mean() - ) + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = people.groupby(key, group_keys=False).apply(demean).groupby(key).mean() tm.assert_frame_equal(result, expected) # GH 8430 @@ -175,14 +165,8 @@ def test_transform_axis_1(transformation_func): args = ("ffill",) if transformation_func == "fillna" else () df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - result = df.groupby([0, 0, 1], axis=1, group_keys=False).transform( - transformation_func, *args - ) - expected = ( - df.T.groupby([0, 0, 1], group_keys=False) - .transform(transformation_func, *args) - .T - ) + result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args) + expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T if transformation_func == "diff": # Result contains nans, so transpose coerces to float @@ -341,7 +325,7 @@ def test_transform_multiple(ts): def test_dispatch_transform(tsframe): df = tsframe[::5].reindex(tsframe.index) - grouped = df.groupby(lambda x: x.month, group_keys=False) + grouped = df.groupby(lambda x: x.month) filled = grouped.fillna(method="pad") fillit = lambda x: x.fillna(method="pad") @@ -375,7 +359,7 @@ def test_transform_transformation_func(transformation_func): test_op = lambda x: x.transform(transformation_func) mock_op = lambda x: getattr(x, transformation_func)() - result = test_op(df.groupby("A", group_keys=False)) + result = test_op(df.groupby("A")) groups = [df[["B"]].iloc[:4], df[["B"]].iloc[4:6], df[["B"]].iloc[6:]] expected = concat([mock_op(g) for g in groups]) diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 702069d7ef721..992aaa540a65f 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -19,7 +19,7 @@ def test_transform_groupby_kernel(string_series, op): args = [0.0] if op == "fillna" else [] ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones, group_keys=False).transform(op, *args) + expected = string_series.groupby(ones).transform(op, *args) result = string_series.transform(op, 0, *args) tm.assert_series_equal(result, expected) From c1953e6d97fca2bd832f2d4afda381cc5125f7e9 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Sun, 3 Jan 2021 15:29:52 -0500 Subject: [PATCH 43/67] Avoid warnings when using apply internally, bare xfail/FutureWarning in tests --- pandas/core/groupby/groupby.py | 31 ++++++++++++++++----------- pandas/tests/groupby/test_apply.py | 11 ++++++---- pandas/tests/groupby/test_groupby.py | 6 ++++-- pandas/tests/groupby/test_grouping.py | 2 +- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git 
a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8eb26888b75b0..37f8c0d767152 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2553,7 +2553,8 @@ def cumprod(self, axis=0, *args, **kwargs): """ nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) if axis != 0: - return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) + f = lambda x: x.cumprod(axis=axis, **kwargs) + return self._python_apply_general(f, self._selected_obj, is_transform=True) return self._cython_transform("cumprod", **kwargs) @@ -2570,7 +2571,8 @@ def cumsum(self, axis=0, *args, **kwargs): """ nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) if axis != 0: - return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) + f = lambda x: x.cumsum(axis=axis, **kwargs) + return self._python_apply_general(f, self._selected_obj, is_transform=True) return self._cython_transform("cumsum", **kwargs) @@ -2586,7 +2588,8 @@ def cummin(self, axis=0, **kwargs): Series or DataFrame """ if axis != 0: - return self.apply(lambda x: np.minimum.accumulate(x, axis)) + f = lambda x: np.minimum.accumulate(x, axis) + return self._python_apply_general(f, self._selected_obj, is_transform=True) return self._cython_transform("cummin", numeric_only=False) @@ -2602,7 +2605,8 @@ def cummax(self, axis=0, **kwargs): Series or DataFrame """ if axis != 0: - return self.apply(lambda x: np.maximum.accumulate(x, axis)) + f = lambda x: np.maximum.accumulate(x, axis) + return self._python_apply_general(f, self._selected_obj, is_transform=True) return self._cython_transform("cummax", numeric_only=False) @@ -2798,7 +2802,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if available. """ if freq is not None or axis != 0 or not isna(fill_value): - return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) + f = lambda x: x.shift(periods, freq, axis, fill_value) + return self._python_apply_general(f, self._selected_obj, is_transform=True) return self._get_cythonized_result( "group_shift_indexer", @@ -2822,15 +2827,15 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 Percentage changes within each group. 
""" if freq is not None or axis != 0: - return self.apply( - lambda x: x.pct_change( - periods=periods, - fill_method=fill_method, - limit=limit, - freq=freq, - axis=axis, - ) + f = lambda x: x.pct_change( + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + axis=axis, ) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + if fill_method is None: # GH30463 fill_method = "pad" limit = 0 diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index cc54001eda453..9af16d5d2c48e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1029,23 +1029,26 @@ def test_apply_result_type(group_keys, udf, is_transform): def test_groupby_apply_group_keys_warns(): df = DataFrame({"A": [0, 1, 1], "B": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning): + msg = "Not prepending group keys to the result index" + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(lambda x: x) tm.assert_frame_equal(result, df) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A")["B"].apply(lambda x: x) tm.assert_series_equal(result, df["B"]) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df["B"].groupby(df["A"]).apply(lambda x: x) tm.assert_series_equal(result, df["B"]) -@pytest.mark.xfail("BinGrouper and Grouper aren't consistent with NA key handling") +@pytest.mark.xfail( + reason="BinGrouper and Grouper aren't consistent with NA key handling" +) def test_resample_with_only_nat(self): # https://github.com/pandas-dev/pandas/issues/35251 pi = pd.PeriodIndex([pd.NaT] * 3, freq="S") diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 62b579bf929ad..48ab9835007e3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -39,7 +39,7 @@ def test_basic(dtype): np.random.shuffle(index) data = data.reindex(index) - grouped = data.groupby(lambda x: x // 3) + grouped = data.groupby(lambda x: x // 3, group_keys=False) for k, v in grouped: assert len(v) == 3 @@ -608,7 +608,9 @@ def test_as_index_select_column(): expected = Series([2, 4], name="B") tm.assert_series_equal(result, expected) - result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( + lambda x: x.cumsum() + ) expected = Series( [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) ) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 0849341f12f8b..e5cd90c21f426 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -645,7 +645,7 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. 
df = DataFrame({1: [], 2: []}) - g = df.groupby(1) + g = df.groupby(1, group_keys=False) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) From 74b98e184314b5bbb566c3229e47578223db47d5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 3 Jan 2021 18:15:06 -0500 Subject: [PATCH 44/67] Remove group_keys from DataFrame.resample, parametrize one test --- pandas/core/frame.py | 2 -- pandas/core/resample.py | 4 ++-- pandas/tests/groupby/test_apply.py | 17 ++++++----------- pandas/tests/resample/test_base.py | 2 +- pandas/tests/resample/test_period_index.py | 2 +- pandas/tests/resample/test_resample_api.py | 2 +- pandas/tests/resample/test_resampler_grouper.py | 2 +- 7 files changed, 12 insertions(+), 19 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a16fa0316a90..443bc721e8d7c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9398,7 +9398,6 @@ def resample( level=None, origin: Union[str, "TimestampConvertibleTypes"] = "start_day", offset: Optional["TimedeltaConvertibleTypes"] = None, - group_keys: bool = no_default, ) -> "Resampler": return super().resample( rule=rule, @@ -9413,7 +9412,6 @@ def resample( level=level, origin=origin, offset=offset, - group_keys=group_keys, ) def to_timestamp( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a95e8ac2f9005..de41377a336dd 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -83,14 +83,14 @@ class Resampler(BaseGroupBy, ShallowMixin): "offset", ] - def __init__(self, obj, groupby=None, axis=0, kind=None, group_keys=True, **kwargs): + def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby = groupby self.keys = None self.sort = True self.axis = axis self.kind = kind self.squeeze = False - self.group_keys = group_keys + self.group_keys = False self.as_index = True self.exclusions = set() self.binner = None diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 9af16d5d2c48e..3eed184b49506 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -79,7 +79,10 @@ def test_apply_trivial2(): tm.assert_frame_equal(result, expected) -def test_fast_apply(): +@pytest.mark.parametrize( + "f, expected_mutated", [(lambda _: 1, True), (lambda g: g, False)] +) +def test_fast_apply(f, expected_mutated): # make sure that fast apply is correctly called # rather than raising any kind of error # otherwise the python path will be called @@ -96,23 +99,15 @@ def test_fast_apply(): } ) - def f1(g): - return 1 - - def f2(g): - return g - g = df.groupby(["key", "key2"]) - grouper = g.grouper splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) group_keys = grouper._get_group_keys() sdata = splitter._get_sorted_data() - for f, expected_mutated in [(f1, True), (f2, False)]: - values, mutated = splitter.fast_apply(f, sdata, group_keys) - assert mutated is expected_mutated + values, mutated = splitter.fast_apply(f, sdata, group_keys) + assert mutated is expected_mutated @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 85201108fd22c..85428befbc64c 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -130,7 +130,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti # count retains dimensions too - result = getattr(df.resample(freq, group_keys=False), resample_method)() + result = 
getattr(df.resample(freq), resample_method)() if resample_method != "size": expected = df.copy() else: diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index d7df399fee40c..e83196e9c7d56 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -790,7 +790,7 @@ def test_resample_with_only_nat(self): frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index) - result = frame.resample("1s", group_keys=False).mean() + result = frame.resample("1s").mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 4e72edd43edec..7fc152bf73f4c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -305,7 +305,7 @@ def test_apply_without_aggregation(): result = t.apply(lambda x: x) tm.assert_series_equal(result, test_series) - grouped = test_series.to_frame(name="foo").resample("20min", group_keys=False) + grouped = test_series.to_frame(name="foo").resample("20min") result = grouped["foo"].apply(lambda x: x) tm.assert_series_equal(result, test_series.rename("foo")) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 86955ac4e4d22..664f32b0e70bb 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -277,7 +277,7 @@ def f(x): tm.assert_frame_equal(result, expected) # A case for series - expected = df["col1"].groupby(pd.Grouper(freq="M")).apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) result = df["col1"].resample("M").apply(f) tm.assert_series_equal(result, expected) From 20d7663ec2c0227cf59617557e90abf2b4c1f59f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 4 Jan 2021 16:59:34 -0500 Subject: [PATCH 45/67] Doc changes --- doc/source/user_guide/groupby.rst | 36 ++++++++++++++++++++++++++----- pandas/core/frame.py | 15 +++++++------ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 156a920826847..44d50677f2427 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1037,7 +1037,13 @@ Some operations on the grouped data might not fit into either the aggregate or transform categories. Or, you may simply want GroupBy to infer how to combine the results. For these, use the ``apply`` function, which can be substituted for both ``aggregate`` and ``transform`` in many standard use cases. However, -``apply`` can handle some exceptional use cases, for example: +``apply`` can handle some exceptional use cases. + +.. note:: + + ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. + So depending on the path taken, and exactly what you are grouping. Thus the grouped column(s) may be included in + the output as well as set the indices. .. ipython:: python @@ -1051,7 +1057,7 @@ The dimension of the returned result can also change: .. 
ipython:: python - grouped = df.groupby('A', group_keys=False)['C'] + grouped = df.groupby('A')['C'] def f(group): return pd.DataFrame({'original': group, @@ -1071,13 +1077,33 @@ that is itself a series, and possibly upcast the result to a DataFrame: s s.apply(f) +Control grouped column(s) placement with ``group_keys`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: - ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. - So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in - the output as well as set the indices. + If ``group_keys=True`` is specified when calling :meth:`~DataFrame.groupby`, + functions passed to ``apply`` that return like-indexed outputs will have the + group keys added to the result index. Previous versions of pandas would add + the group keys only when the result from the applied function had a different + index than the input. If ``group_keys`` is not specified, the group keys will + not be added for like-indexed outputs. In this future this behavior + will change to always respect ``group_keys``, which defaults to ``True``. + + .. versionchanged:: 1.3.0 + +To control whether the grouped column(s) are included in the indices, you can use +the argument ``group_keys``. Compare + +.. ipython:: python + + df.groupby("A", group_keys=True).apply(lambda x: x) + +with + +.. ipython:: python + df.groupby("A", group_keys=False).apply(lambda x: x) Numba Accelerated Routines -------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 443bc721e8d7c..666653910b188 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6701,17 +6701,11 @@ def update( NaN 12.3 33.0 When using ``.apply()``, use ``group_keys`` to include or exclude the group keys. +The ``group_keys`` argument defaults to ``True`` (include). >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) ->>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed -0 Falcon 380.0 -1 Falcon 370.0 -2 Parrot 24.0 -3 Parrot 26.0 - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) Animal Max Speed Animal @@ -6719,6 +6713,13 @@ def update( 1 Falcon 370.0 Parrot 2 Parrot 24.0 3 Parrot 26.0 + +>>> df.groupby("Animal", group_keys=False).apply(lambda x: x) + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 """ ) @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) From c3bccb39f589ca4d64deb6d8ba147be89d7ec167 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 4 Jan 2021 17:12:57 -0500 Subject: [PATCH 46/67] Remove group_keys from NDFrame.resample --- pandas/core/generic.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 81bccf640978b..fe86bf3f582ca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7735,7 +7735,6 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, - group_keys: bool_t = lib.no_default, ) -> Resampler: """ Resample time-series data. @@ -7811,12 +7810,6 @@ def resample( .. versionadded:: 1.1.0 - group_keys : bool, default True - Whether to include the group keys in the result index when performing - a ``.groupby().apply()`` to the resampled object. - - .. 
versionadded:: 1.3.0 - Returns ------- pandas.core.Resampler @@ -8156,7 +8149,6 @@ def resample( level=level, origin=origin, offset=offset, - group_keys=group_keys, ) @final From 7f7ee525c61fc4f7509fbd9b9e7575b4d78316b2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 4 Jan 2021 17:16:51 -0500 Subject: [PATCH 47/67] Remove group_keys from Series.resample --- pandas/core/series.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 594558cf9f7b1..34073837677f2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4875,7 +4875,6 @@ def resample( level=None, origin: Union[str, "TimestampConvertibleTypes"] = "start_day", offset: Optional["TimedeltaConvertibleTypes"] = None, - group_keys: bool = no_default, ) -> "Resampler": return super().resample( rule=rule, @@ -4890,7 +4889,6 @@ def resample( level=level, origin=origin, offset=offset, - group_keys=group_keys, ) def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": From 4f652a9782a0dc99e92b8d33570d6f5dcec6ba25 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 4 Jan 2021 18:08:18 -0500 Subject: [PATCH 48/67] Fixed _is_indexed_like bug --- pandas/core/groupby/ops.py | 3 +- pandas/tests/groupby/test_apply.py | 11 ++++++++ pandas/tests/resample/test_base.py | 2 +- pandas/tests/resample/test_resample_api.py | 28 +------------------ .../tests/resample/test_resampler_grouper.py | 2 +- 5 files changed, 16 insertions(+), 30 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index dd86b5c1a75b7..26ebe1bbfbaf1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -928,6 +928,7 @@ def agg_series(self, obj: Series, func: F): def _is_indexed_like(obj, axes) -> bool: + """Returns True when all axes of obj equal that of axes.""" if isinstance(obj, Series): if len(axes) > 1: return False @@ -936,7 +937,7 @@ def _is_indexed_like(obj, axes) -> bool: is_frame = isinstance(obj, DataFrame) if is_frame and len(axes) == 1: # The UDF was DataFrame -> Series - return True + return False elif is_frame: # The UDF was DataFrame -> DataFrame return obj.index.equals(axes[0]) and obj.columns.equals(axes[1]) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 3eed184b49506..ac7a17bfd8a13 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -340,6 +340,17 @@ def desc3(group): assert result3.index.names == ("A", "B", None) +def test_apply_series_to_series_new_index(): + # GH 34988 + df = DataFrame({"A": list("xy"), "B": [1, 2]}) + + mi = MultiIndex.from_arrays([list("xy"), [0, 0]], names=("A", None)) + expected = DataFrame({"index": [0, 1]}, index=mi) + + result = df.groupby("A")["B"].apply(lambda x: x.reset_index(drop=False)[["index"]]) + tm.assert_frame_equal(result, expected) + + def test_apply_series_to_frame(): def f(piece): with np.errstate(invalid="ignore"): diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 85428befbc64c..7389fa31109f8 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -186,7 +186,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # them to ensure they no longer do. 
(GH #10228) empty_series_dti = Series([], index, dtype) try: - getattr(empty_series_dti.resample("d", group_keys=False), resample_method)() + getattr(empty_series_dti.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7fc152bf73f4c..f8f1eeb3208f7 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -90,32 +90,6 @@ def test_groupby_resample_on_api(): tm.assert_frame_equal(result, expected) -def test_resample_group_keys(): - df = DataFrame({"A": 1, "B": 2}, index=pd.date_range("2000", periods=10)) - g = df.resample("5D") - expected = df.copy() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # stacklevel is set for groupby, not resample - result = g.apply(lambda x: x) - tm.assert_frame_equal(result, expected) - - # no warning - g = df.resample("5D", group_keys=False) - with tm.assert_produces_warning(None): - result = g.apply(lambda x: x) - tm.assert_frame_equal(result, expected) - - # no warning, group keys - expected.index = pd.MultiIndex.from_arrays( - [pd.to_datetime(["2000-01-01", "2000-01-06"]).repeat(5), expected.index] - ) - - g = df.resample("5D", group_keys=True) - with tm.assert_produces_warning(None): - result = g.apply(lambda x: x) - tm.assert_frame_equal(result, expected) - - def test_pipe(test_frame): # GH17905 @@ -298,7 +272,7 @@ def test_fillna(): def test_apply_without_aggregation(): # both resample and groupby should work w/o aggregation - r = test_series.resample("20min", group_keys=False) + r = test_series.resample("20min") g = test_series.groupby(pd.Grouper(freq="20min"), group_keys=False) for t in [g, r]: diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 664f32b0e70bb..f982581b76b35 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -271,7 +271,7 @@ def f(x): s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq="M")).apply(f) + expected = df.groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) result = df.resample("M").apply(f) tm.assert_frame_equal(result, expected) From 499ecd422abd52f61ea0d41d279ac197bb4c4957 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 4 Jan 2021 18:12:59 -0500 Subject: [PATCH 49/67] Minor fixups --- doc/source/user_guide/groupby.rst | 2 +- pandas/tests/groupby/test_apply.py | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 44d50677f2427..523f2c13ac9dc 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1087,7 +1087,7 @@ Control grouped column(s) placement with ``group_keys`` group keys added to the result index. Previous versions of pandas would add the group keys only when the result from the applied function had a different index than the input. If ``group_keys`` is not specified, the group keys will - not be added for like-indexed outputs. In this future this behavior + not be added for like-indexed outputs. In the future this behavior will change to always respect ``group_keys``, which defaults to ``True``. .. 
versionchanged:: 1.3.0 diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ac7a17bfd8a13..8ab37721049df 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -340,17 +340,6 @@ def desc3(group): assert result3.index.names == ("A", "B", None) -def test_apply_series_to_series_new_index(): - # GH 34988 - df = DataFrame({"A": list("xy"), "B": [1, 2]}) - - mi = MultiIndex.from_arrays([list("xy"), [0, 0]], names=("A", None)) - expected = DataFrame({"index": [0, 1]}, index=mi) - - result = df.groupby("A")["B"].apply(lambda x: x.reset_index(drop=False)[["index"]]) - tm.assert_frame_equal(result, expected) - - def test_apply_series_to_frame(): def f(piece): with np.errstate(invalid="ignore"): @@ -369,6 +358,17 @@ def f(piece): tm.assert_index_equal(result.index, ts.index) +def test_apply_series_to_frame_new_index(): + # GH 34988 + df = DataFrame({"A": list("xy"), "B": [1, 2]}) + + mi = MultiIndex.from_arrays([list("xy"), [0, 0]], names=("A", None)) + expected = DataFrame({"index": [0, 1]}, index=mi) + + result = df.groupby("A")["B"].apply(lambda x: x.reset_index(drop=False)[["index"]]) + tm.assert_frame_equal(result, expected) + + def test_apply_series_yield_constant(df): result = df.groupby(["A", "B"])["C"].apply(len) assert result.index.names[:2] == ("A", "B") From d1f2d29c7427a3d07e8bf307cdec4e78b66e2cd4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 4 Jan 2021 18:15:54 -0500 Subject: [PATCH 50/67] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 082e078645279..aac3b672f6a73 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -158,8 +158,12 @@ Deprecations :meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`~DataFrame.groupby` will no longer ignore the ``group_keys`` argument for functions passed to ``apply`` that return like-indexed outputs (:issue:`34809`, :issue:`31612`, :issue:`14927`, :issue:`13056`). -Previous versions of pandas would add the group keys only when the result from the applied function had a different index to the input. +If ``group_keys=True`` is specified when calling :meth:`~DataFrame.groupby`, +functions passed to ``apply`` that return like-indexed outputs will have the +group keys added to the result index. Previous versions of pandas would add +the group keys only when the result from the applied function had a different +index than the input. If ``group_keys`` is not specified, the group keys will +not be added for like-indexed outputs. 
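As a sketch of how existing code can be audited for this deprecation (assuming a build that includes it), the warning can be promoted to an error with the standard ``warnings`` machinery; passing ``group_keys`` explicitly avoids it either way. The before/after output itself is shown in the examples below.

import warnings
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 2], "B": [1, 2, 3]})

with warnings.catch_warnings():
    # Turn the deprecation into an error so affected call sites are easy to find.
    warnings.simplefilter("error", FutureWarning)

    # Explicit group_keys: no warning, the result index is fully determined.
    df.groupby("A", group_keys=True).apply(lambda x: x)
    df.groupby("A", group_keys=False).apply(lambda x: x)

    # Unspecified group_keys with a transform-like UDF raises here, because
    # the result's index will change in a future version of pandas.
    try:
        df.groupby("A").apply(lambda x: x)
    except FutureWarning:
        pass
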
*Previous behavior*: From 45cd9807148f3e26551936efd58d7317fc7eb133 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 9 Jan 2021 08:32:19 -0500 Subject: [PATCH 51/67] Restored group_keys in resample, defaults to no_default --- pandas/core/frame.py | 2 ++ pandas/core/generic.py | 8 +++++ pandas/core/resample.py | 6 ++-- pandas/core/series.py | 2 ++ pandas/tests/resample/test_base.py | 4 +-- pandas/tests/resample/test_period_index.py | 2 +- pandas/tests/resample/test_resample_api.py | 30 +++++++++++++++++-- .../tests/resample/test_resampler_grouper.py | 4 +-- 8 files changed, 49 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 666653910b188..d5ead11ae0f07 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9399,6 +9399,7 @@ def resample( level=None, origin: Union[str, "TimestampConvertibleTypes"] = "start_day", offset: Optional["TimedeltaConvertibleTypes"] = None, + group_keys: bool = no_default, ) -> "Resampler": return super().resample( rule=rule, @@ -9413,6 +9414,7 @@ def resample( level=level, origin=origin, offset=offset, + group_keys=group_keys, ) def to_timestamp( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fe86bf3f582ca..47902f2df50bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7735,6 +7735,7 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, + group_keys: bool_t = lib.no_default, ) -> Resampler: """ Resample time-series data. @@ -7810,6 +7811,12 @@ def resample( .. versionadded:: 1.1.0 + group_keys : bool, optional + Whether to include the group keys in the result index when performing + a ``.groupby().apply()`` to the resampled object. + + .. 
versionadded:: 1.3.0 + Returns ------- pandas.core.Resampler @@ -8149,6 +8156,7 @@ def resample( level=level, origin=origin, offset=offset, + group_keys=group_keys, ) @final diff --git a/pandas/core/resample.py b/pandas/core/resample.py index de41377a336dd..fbead0e228e0f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -83,14 +83,16 @@ class Resampler(BaseGroupBy, ShallowMixin): "offset", ] - def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): + def __init__( + self, obj, groupby=None, axis=0, kind=None, group_keys=lib.no_default, **kwargs + ): self.groupby = groupby self.keys = None self.sort = True self.axis = axis self.kind = kind self.squeeze = False - self.group_keys = False + self.group_keys = group_keys self.as_index = True self.exclusions = set() self.binner = None diff --git a/pandas/core/series.py b/pandas/core/series.py index 34073837677f2..594558cf9f7b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4875,6 +4875,7 @@ def resample( level=None, origin: Union[str, "TimestampConvertibleTypes"] = "start_day", offset: Optional["TimedeltaConvertibleTypes"] = None, + group_keys: bool = no_default, ) -> "Resampler": return super().resample( rule=rule, @@ -4889,6 +4890,7 @@ def resample( level=level, origin=origin, offset=offset, + group_keys=group_keys, ) def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 7389fa31109f8..85201108fd22c 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -130,7 +130,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti # count retains dimensions too - result = getattr(df.resample(freq), resample_method)() + result = getattr(df.resample(freq, group_keys=False), resample_method)() if resample_method != "size": expected = df.copy() else: @@ -186,7 +186,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # them to ensure they no longer do. 
(GH #10228) empty_series_dti = Series([], index, dtype) try: - getattr(empty_series_dti.resample("d"), resample_method)() + getattr(empty_series_dti.resample("d", group_keys=False), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index e83196e9c7d56..d7df399fee40c 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -790,7 +790,7 @@ def test_resample_with_only_nat(self): frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index) - result = frame.resample("1s").mean() + result = frame.resample("1s", group_keys=False).mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index f8f1eeb3208f7..4e72edd43edec 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -90,6 +90,32 @@ def test_groupby_resample_on_api(): tm.assert_frame_equal(result, expected) +def test_resample_group_keys(): + df = DataFrame({"A": 1, "B": 2}, index=pd.date_range("2000", periods=10)) + g = df.resample("5D") + expected = df.copy() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # stacklevel is set for groupby, not resample + result = g.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + + # no warning + g = df.resample("5D", group_keys=False) + with tm.assert_produces_warning(None): + result = g.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + + # no warning, group keys + expected.index = pd.MultiIndex.from_arrays( + [pd.to_datetime(["2000-01-01", "2000-01-06"]).repeat(5), expected.index] + ) + + g = df.resample("5D", group_keys=True) + with tm.assert_produces_warning(None): + result = g.apply(lambda x: x) + tm.assert_frame_equal(result, expected) + + def test_pipe(test_frame): # GH17905 @@ -272,14 +298,14 @@ def test_fillna(): def test_apply_without_aggregation(): # both resample and groupby should work w/o aggregation - r = test_series.resample("20min") + r = test_series.resample("20min", group_keys=False) g = test_series.groupby(pd.Grouper(freq="20min"), group_keys=False) for t in [g, r]: result = t.apply(lambda x: x) tm.assert_series_equal(result, test_series) - grouped = test_series.to_frame(name="foo").resample("20min") + grouped = test_series.to_frame(name="foo").resample("20min", group_keys=False) result = grouped["foo"].apply(lambda x: x) tm.assert_series_equal(result, test_series.rename("foo")) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index f982581b76b35..86955ac4e4d22 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -271,13 +271,13 @@ def f(x): s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) + expected = df.groupby(pd.Grouper(freq="M")).apply(f) result = df.resample("M").apply(f) tm.assert_frame_equal(result, expected) # A case for series - expected = df["col1"].groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="M")).apply(f) result = df["col1"].resample("M").apply(f) tm.assert_series_equal(result, expected) From 
804312afa7e6577df867aa697314675ed94d9017 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 9 Jan 2021 09:16:37 -0500 Subject: [PATCH 52/67] Fixed stacklevel for resample warning, simplified test --- pandas/core/groupby/groupby.py | 6 +++++- pandas/tests/groupby/test_apply.py | 6 +++--- pandas/tests/resample/test_resample_api.py | 3 +-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 37f8c0d767152..ebd5909a78a0b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,6 +11,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import inspect +import os from textwrap import dedent import types from typing import ( @@ -965,7 +966,10 @@ def _python_apply_general( ): # We've detected value-dependent behavior: the result's index depends on # whether the user's function `f` returned the same index or not. - if self.ndim == 1: + caller = inspect.stack()[2] + if caller.filename.endswith(os.path.join("pandas", "core", "resample.py")): + stacklevel = 5 + elif self.ndim == 1: stacklevel = 4 elif self._selection is None: stacklevel = 3 diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 8ab37721049df..981ffeaf5f62a 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1013,11 +1013,11 @@ def test_apply_function_with_indexing_return_column(): @pytest.mark.parametrize( - "udf, is_transform", - [(lambda x: x.copy(), True), (lambda x: x.copy().rename(lambda y: y + 1), False)], + "udf", + [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))], ) @pytest.mark.parametrize("group_keys", [True, False]) -def test_apply_result_type(group_keys, udf, is_transform): +def test_apply_result_type(group_keys, udf): # https://github.com/pandas-dev/pandas/issues/34809 # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 4e209c855dd25..fbf660bc40e8c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -94,8 +94,7 @@ def test_resample_group_keys(): df = DataFrame({"A": 1, "B": 2}, index=pd.date_range("2000", periods=10)) g = df.resample("5D") expected = df.copy() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # stacklevel is set for groupby, not resample + with tm.assert_produces_warning(FutureWarning, match="Not prepending group keys"): result = g.apply(lambda x: x) tm.assert_frame_equal(result, expected) From a5ce219fecfecc2b17f2c5ce4f1f57ba8e4e21c4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 12 Jan 2021 18:39:30 -0500 Subject: [PATCH 53/67] whatsnew and docstring for DataFrame.resample --- doc/source/whatsnew/v1.3.0.rst | 55 ++++++++++++++++++++++++++++++++-- pandas/core/generic.py | 9 ++++-- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c2543426a3454..2ba50a1802d52 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -33,12 +33,63 @@ For example: storage_options=headers ) + + .. 
_whatsnew_130.window_method_table: :class:`Rolling` and :class:`Expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See ref:`window.overview` for performance and functional benefits. (:issue:`15095`, :issue:`38995`) +.. _whatsnew_130.resample_group_keys: + +Control of index with ``group_keys`` in :meth:`DataFrame.resample` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The argument ``group_keys`` has been added to the method :meth:`DataFrame.resample`. +As with :meth:`DataFrame.groupby`, this argument controls the whether each group is added +to the index in the resample when :meth:`.Resampler.apply` is used. + +.. warning:: + Not specifying the ``group_keys`` argument will retain the + previous behavior and emit a warning. In a future version + of pandas, not specifying ``group_keys`` will default to + the same behavior as ``group_keys=False``. + +.. ipython:: python + + df = pd.DataFrame( + {'a': range(6)}, + index=pd.date_range("2021-01-01", periods=6, freq="8H") + ) + df.resample("D", group_keys=True).apply(lambda x: x) + df.resample("D", group_keys=False).apply(lambda x: x) + +Previously, the resulting index would depend upon the values returned by ``apply``, +as seen in the following example. + +.. code-block:: python + + >>> # pandas 1.2 + >>> df.resample("D").apply(lambda x: x) + a + 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 + >>> df.resample("D").apply(lambda x: x.reset_index()) + index a + 2021-01-01 0 2021-01-01 00:00:00 0 + 1 2021-01-01 08:00:00 1 + 2 2021-01-01 16:00:00 2 + 2021-01-02 0 2021-01-02 00:00:00 3 + 1 2021-01-02 08:00:00 4 + 2 2021-01-02 16:00:00 5 + + + .. _whatsnew_130.enhancements.other: Other enhancements @@ -158,8 +209,8 @@ Other API changes Deprecations ~~~~~~~~~~~~ -:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`~DataFrame.groupby` no longer ignore ``group_keys`` for transform-like ``apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If ``group_keys=True`` is specified when calling :meth:`~DataFrame.groupby`, functions passed to ``apply`` that return like-indexed outputs will have the diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a94b3e2a4af17..01946d4ed8728 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7809,8 +7809,13 @@ def resample( .. versionadded:: 1.1.0 group_keys : bool, optional - Whether to include the group keys in the result index when performing - a ``.groupby().apply()`` to the resampled object. + Whether to include the group keys in the result index when using + ``.apply()`` on the resampled object. Not specifying ``group_keys`` + will retain values-dependent behavior from pandas 1.2 + and earlier (see + :ref:`pandas 1.3.0 Release notes` + for examples). In a future version of pandas, the behavior will + default to the same as specifying ``group_keys=False``. .. 
versionadded:: 1.3.0 From 15b15560e3a661019ef0184ecbf80df6e02b0295 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 12 Jan 2021 18:41:21 -0500 Subject: [PATCH 54/67] Revert accidental changes --- doc/source/whatsnew/v1.3.0.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2ba50a1802d52..71a0dce1ee734 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -33,8 +33,6 @@ For example: storage_options=headers ) - - .. _whatsnew_130.window_method_table: :class:`Rolling` and :class:`Expanding` now support a ``method`` argument with a @@ -209,8 +207,8 @@ Other API changes Deprecations ~~~~~~~~~~~~ -:meth:`~DataFrame.groupby` no longer ignore ``group_keys`` for transform-like ``apply`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`~DataFrame.groupby` no longer ignores ``group_keys`` for transform-like ``apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If ``group_keys=True`` is specified when calling :meth:`~DataFrame.groupby`, functions passed to ``apply`` that return like-indexed outputs will have the From f80c4c0c6be6676881e328e760d3481008b1a4c0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 26 Jan 2021 17:25:53 -0500 Subject: [PATCH 55/67] Removed unnecessary group_keys in tests --- pandas/tests/groupby/test_function.py | 4 +--- pandas/tests/resample/test_period_index.py | 2 +- pandas/tests/window/test_groupby.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 74cfa6ee745b7..1d62123ff9af5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -971,9 +971,7 @@ def test_frame_describe_multikey(tsframe): expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) - groupedT = tsframe.groupby( - {"A": 0, "B": 0, "C": 1, "D": 1}, axis=1, group_keys=True - ) + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 8f7482c339893..2fe3fb91768e6 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -790,7 +790,7 @@ def test_resample_with_only_nat(self): frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index, columns=["a"], dtype="int64") - result = frame.resample("1s", group_keys=False).mean() + result = frame.resample("1s").mean() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 750581f834ddf..0f5bf76ae98bf 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -720,7 +720,7 @@ class TestEWM: def test_methods(self, method, expected_data): # GH 16037 df = DataFrame({"A": ["a"] * 4, "B": range(4)}) - result = getattr(df.groupby("A", group_keys=True).ewm(com=1.0), method)() + result = getattr(df.groupby("A").ewm(com=1.0), method)() expected = DataFrame( {"B": expected_data}, index=MultiIndex.from_tuples( From 62c42e8703f0404ed8cd325cba1871685009d29a 
Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 26 Jan 2021 18:45:13 -0500 Subject: [PATCH 56/67] Removed unnecessary xfail, testing equals instead of is --- pandas/_libs/reduction.pyx | 2 +- pandas/tests/extension/test_numpy.py | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 2ff3e8db3ed9c..5a54b96708978 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -366,7 +366,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if not piece.index is chunk.index: + if not piece.index.equals(chunk.index): mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 07b574af2ef62..ff34b2d3256bf 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -194,17 +194,7 @@ def test_take_series(self, data): class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): - def test_groupby_extension_apply( - self, data_for_grouping, groupby_apply_op, request - ): - dummy = groupby_apply_op([None]) - if ( - isinstance(dummy, pd.Series) - and data_for_grouping.dtype.numpy_dtype == object - ): - mark = pytest.mark.xfail(reason="raises in MultiIndex construction") - request.node.add_marker(mark) - super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + pass class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): From bf4b126012697aece524bb29f03256e55ce284ab Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 26 Jan 2021 21:43:20 -0500 Subject: [PATCH 57/67] Revert is -> eqauls change --- pandas/_libs/reduction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 5a54b96708978..2ff3e8db3ed9c 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -366,7 +366,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if not piece.index.equals(chunk.index): + if not piece.index is chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. 
an int From 2b02b6f1b62e8905fc85e85c0a8096ca0e2b90b8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Nov 2021 19:46:23 -0500 Subject: [PATCH 58/67] fixups --- pandas/core/groupby/groupby.py | 73 +++++++++++++++++------------ pandas/tests/apply/test_str.py | 10 ++-- pandas/tests/window/test_groupby.py | 7 +-- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 30c1d38dcc315..088384aafc65b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1005,7 +1005,12 @@ def _iterate_slices(self) -> Iterable[Series]: # Dispatch/Wrapping @final - def _concat_objects(self, values, not_indexed_same: bool = False): + def _concat_objects( + self, + values, + not_indexed_same: bool = False, + override_group_keys: bool = False, + ): from pandas.core.reshape.concat import concat def reset_identity(values): @@ -1016,28 +1021,7 @@ def reset_identity(values): ax._reset_identity() return values - if not not_indexed_same: - result = concat(values, axis=self.axis) - - ax = self._selected_obj._get_axis(self.axis) - if self.dropna: - labels = self.grouper.group_info[0] - mask = labels != -1 - ax = ax[mask] - - # this is a very unfortunate situation - # we can't use reindex to restore the original order - # when the ax has duplicates - # so we resort to this - # GH 14776, 30667 - if ax.has_duplicates and not result.axes[self.axis].equals(ax): - indexer, _ = result.index.get_indexer_non_unique(ax._values) - indexer = algorithms.unique1d(indexer) - result = result.take(indexer, axis=self.axis) - else: - result = result.reindex(ax, axis=self.axis, copy=False) - - elif self.group_keys: + if self.group_keys and not override_group_keys: values = reset_identity(values) if self.as_index: @@ -1061,6 +1045,28 @@ def reset_identity(values): # range index keys = list(range(len(values))) result = concat(values, axis=self.axis, keys=keys) + + elif not not_indexed_same: + result = concat(values, axis=self.axis) + + ax = self._selected_obj._get_axis(self.axis) + if self.dropna: + labels = self.grouper.group_info[0] + mask = labels != -1 + ax = ax[mask] + + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + if ax.has_duplicates and not result.axes[self.axis].equals(ax): + indexer, _ = result.index.get_indexer_non_unique(ax._values) + indexer = algorithms.unique1d(indexer) + result = result.take(indexer, axis=self.axis) + else: + result = result.reindex(ax, axis=self.axis, copy=False) + else: values = reset_identity(values) result = concat(values, axis=self.axis) @@ -1193,7 +1199,13 @@ def _wrap_transformed_output( result.index = self.obj.index return result - def _wrap_applied_output(self, data, values: list, not_indexed_same: bool = False): + def _wrap_applied_output( + self, + data, + values: list, + not_indexed_same: bool = False, + override_group_keys: bool = False, + ): raise AbstractMethodError(self) def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: @@ -1454,8 +1466,9 @@ def _python_apply_general( Series or DataFrame data after applying f """ - keys, values, mutated = self.grouper.apply(f, data, self.axis) - not_indexed_same = mutated or self.mutated + values, mutated = self.grouper.apply(f, data, self.axis) + if not_indexed_same is None: + not_indexed_same = mutated or self.mutated override_group_keys = False if (not not_indexed_same and self.group_keys is 
lib.no_default) and not ( @@ -1490,11 +1503,11 @@ def _python_apply_general( # may be removed. override_group_keys = True - if not_indexed_same is None: - not_indexed_same = mutated or self.mutated - return self._wrap_applied_output( - data, values, not_indexed_same, override_group_keys + data, + values, + not_indexed_same, + override_group_keys=is_transform or override_group_keys, ) @final diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 6c6b674ef6aab..fbb4749bf92ab 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -248,7 +248,7 @@ def test_transform_groupby_kernel_series(string_series, op): args = [0.0] if op == "fillna" else [] ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones).transform(op, *args) + expected = string_series.groupby(ones, group_keys=False).transform(op, *args) result = string_series.transform(op, 0, *args) tm.assert_series_equal(result, expected) @@ -271,7 +271,9 @@ def test_transform_groupby_kernel_frame( ones = np.ones(float_frame.shape[0]) else: ones = np.ones(float_frame.shape[1]) - expected = float_frame.groupby(ones, axis=axis).transform(op, *args) + expected = float_frame.groupby(ones, axis=axis, group_keys=False).transform( + op, *args + ) result = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result, expected) @@ -284,7 +286,9 @@ def test_transform_groupby_kernel_frame( ones = np.ones(float_frame.shape[0]) else: ones = np.ones(float_frame.shape[1]) - expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args) + expected2 = float_frame.groupby(ones, axis=axis, group_keys=False).transform( + op, *args + ) result2 = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result2, expected2) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index ff7289ae029d2..d42f9cb87e770 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1039,9 +1039,10 @@ def test_times_vs_apply(self, times_frame): # GH#42738 result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() expected = ( - times_frame.groupby("A") - .apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) - .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] + times_frame.groupby("A", group_keys=False).apply( + lambda x: x.ewm(halflife=halflife, times="C").mean() + ) + # .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] .reset_index(drop=True) ) tm.assert_frame_equal(result.reset_index(drop=True), expected) From 3e175f9a3b62a423dc5d88172280c514d2ae5291 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Nov 2021 21:34:29 -0500 Subject: [PATCH 59/67] Test fixup --- pandas/tests/window/test_groupby.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index d42f9cb87e770..981a9683a97bd 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1038,14 +1038,10 @@ def test_times_vs_apply(self, times_frame): with tm.assert_produces_warning(FutureWarning, match="nuisance"): # GH#42738 result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() - expected = ( - times_frame.groupby("A", group_keys=False).apply( - lambda x: x.ewm(halflife=halflife, times="C").mean() - ) - # .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] - .reset_index(drop=True) + expected = times_frame.groupby("A", group_keys=True).apply( + lambda x: x.ewm(halflife=halflife, times="C").mean() ) - 
tm.assert_frame_equal(result.reset_index(drop=True), expected) + tm.assert_frame_equal(result, expected) def test_times_array(self, times_frame): # GH 40951 From 7f3cc48420b993b33a7b6bd9d3d8c6372819850b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Nov 2021 21:58:50 -0500 Subject: [PATCH 60/67] Update verison to 1.4.0, use find_stack_level --- doc/source/user_guide/groupby.rst | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/groupby/groupby.py | 12 +----------- pandas/core/shared_docs.py | 4 +--- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 9c22bbe30d823..abc9a0843eeb1 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1105,7 +1105,7 @@ Control grouped column(s) placement with ``group_keys`` not be added for like-indexed outputs. In the future this behavior will change to always respect ``group_keys``, which defaults to ``True``. - .. versionchanged:: 1.3.0 + .. versionchanged:: 1.4.0 To control whether the grouped column(s) are included in the indices, you can use the argument ``group_keys``. Compare diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 57dfc16ced421..b26ddce619ae6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7899,11 +7899,11 @@ def resample( ``.apply()`` on the resampled object. Not specifying ``group_keys`` will retain values-dependent behavior from pandas 1.2 and earlier (see - :ref:`pandas 1.3.0 Release notes` + :ref:`pandas 1.4.0 Release notes` for examples). In a future version of pandas, the behavior will default to the same as specifying ``group_keys=False``. - .. versionadded:: 1.3.0 + .. versionadded:: 1.4.0 Returns ------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 088384aafc65b..e08288d3dc56f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -15,7 +15,6 @@ class providing the base-class of operations. wraps, ) import inspect -import os from textwrap import dedent import types from typing import ( @@ -1476,15 +1475,6 @@ def _python_apply_general( ): # We've detected value-dependent behavior: the result's index depends on # whether the user's function `f` returned the same index or not. - caller = inspect.stack()[2] - if caller.filename.endswith(os.path.join("pandas", "core", "resample.py")): - stacklevel = 5 - elif self.ndim == 1: - stacklevel = 4 - elif self._selection is None: - stacklevel = 3 - else: - stacklevel = 4 msg = ( "Not prepending group keys to the result index of " "transform-like apply. In the future, the group keys " @@ -1495,7 +1485,7 @@ def _python_apply_general( "To adopt the future behavior and silence this warning, use " "\n\n\t>>> .groupby(..., group_keys=True)" ) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) # We want to behave as if `self.group_keys=False` when reconstructing # the object. However, we don't want to mutate the stateful GroupBy # object, so we just override it. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 3d7a91180d317..e01e5916a577c 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -116,7 +116,7 @@ By default group keys are not included when the result's index (and column) labels match the inputs, and are included otherwise. - .. versionchanged:: 1.3.0 + .. 
versionchanged:: 1.4.0 Warns that `group_keys` will no longer be ignored when the result from ``apply`` is a like-indexed Series or DataFrame. @@ -132,8 +132,6 @@ This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. - - .. versionadded:: 0.23.0 dropna : bool, default True If True, and if group keys contain NA values, NA values together with row/column will be dropped. From 0c0b2c543476404eea59d7fa2601ad79dcd707ee Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Nov 2021 22:36:05 -0500 Subject: [PATCH 61/67] Cleanups --- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 6 ++++- pandas/core/groupby/ops.py | 17 ++++---------- pandas/tests/apply/test_str.py | 10 +++----- pandas/tests/extension/test_numpy.py | 3 ++- pandas/tests/generic/test_finalize.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 4 ++-- pandas/tests/groupby/test_apply.py | 23 ------------------- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_grouping.py | 2 +- 10 files changed, 21 insertions(+), 50 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b26ddce619ae6..dd97f294b3a0f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7897,7 +7897,7 @@ def resample( group_keys : bool, optional Whether to include the group keys in the result index when using ``.apply()`` on the resampled object. Not specifying ``group_keys`` - will retain values-dependent behavior from pandas 1.2 + will retain values-dependent behavior from pandas 1.3 and earlier (see :ref:`pandas 1.4.0 Release notes` for examples). In a future version of pandas, the behavior will diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e08288d3dc56f..73c6e616ca797 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2272,7 +2272,11 @@ def ohlc(self) -> DataFrame: @doc(DataFrame.describe) def describe(self, **kwargs): with self._group_selection_context(): - result = self.apply(lambda x: x.describe(**kwargs)) + result = self._python_apply_general( + lambda x: x.describe(**kwargs), + self._selected_obj, + not_indexed_same=True, + ) if self.axis == 1: return result.T return result.unstack() diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b654443ef73a4..60c8851f059fe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -749,7 +749,7 @@ def apply( # group might be modified group_axes = group.axes res = f(group) - if not mutated and not _is_indexed_like(res, group_axes): + if not mutated and not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) @@ -1167,20 +1167,13 @@ def _aggregate_series_fast(self, obj: Series, func: Callable) -> np.ndarray: ) -def _is_indexed_like(obj, axes) -> bool: - """Returns True when all axes of obj equal that of axes.""" +def _is_indexed_like(obj, axes, axis: int) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.index.equals(axes[0]) - - is_frame = isinstance(obj, DataFrame) - if is_frame and len(axes) == 1: - # The UDF was DataFrame -> Series - return False - elif is_frame: - # The UDF was DataFrame -> DataFrame - return obj.index.equals(axes[0]) and obj.columns.equals(axes[1]) + return obj.axes[axis].equals(axes[axis]) + elif isinstance(obj, DataFrame): + return obj.axes[axis].equals(axes[axis]) return False diff --git a/pandas/tests/apply/test_str.py 
b/pandas/tests/apply/test_str.py index fbb4749bf92ab..6c6b674ef6aab 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -248,7 +248,7 @@ def test_transform_groupby_kernel_series(string_series, op): args = [0.0] if op == "fillna" else [] ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones, group_keys=False).transform(op, *args) + expected = string_series.groupby(ones).transform(op, *args) result = string_series.transform(op, 0, *args) tm.assert_series_equal(result, expected) @@ -271,9 +271,7 @@ def test_transform_groupby_kernel_frame( ones = np.ones(float_frame.shape[0]) else: ones = np.ones(float_frame.shape[1]) - expected = float_frame.groupby(ones, axis=axis, group_keys=False).transform( - op, *args - ) + expected = float_frame.groupby(ones, axis=axis).transform(op, *args) result = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result, expected) @@ -286,9 +284,7 @@ def test_transform_groupby_kernel_frame( ones = np.ones(float_frame.shape[0]) else: ones = np.ones(float_frame.shape[1]) - expected2 = float_frame.groupby(ones, axis=axis, group_keys=False).transform( - op, *args - ) + expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args) result2 = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result2, expected2) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index f98f4f04e5d67..c9acac04e1f2c 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -220,7 +220,8 @@ def test_getitem_scalar(self, data): class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): - pass + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index eb60c4db0a4c8..135e8cc7b7aba 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -771,7 +771,7 @@ def test_groupby_finalize(obj, method): @not_implemented_mark def test_groupby_finalize_not_implemented(obj, method): obj.attrs = {"a": 1} - result = method(obj.groupby([0, 0], group_keys=False)) + result = method(obj.groupby([0, 0])) assert result.attrs == {"a": 1} diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d3cfd1b05be2d..f178f85154319 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -137,7 +137,7 @@ def test_groupby_aggregation_multi_level_column(): def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA - grouped = ts.groupby(ts * np.nan, group_keys=False) + grouped = ts.groupby(ts * np.nan) assert ts.dtype == np.float64 # groupby float64 values results in Float64Index @@ -147,7 +147,7 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame - grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False) + grouped = tsframe.groupby(tsframe["A"] * np.nan) exp_df = DataFrame( columns=tsframe.columns, dtype=float, diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index c33726701254a..4b7655cf1667c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -337,17 +337,6 @@ def f(piece): 
tm.assert_index_equal(result.index, ts.index) -def test_apply_series_to_frame_new_index(): - # GH 34988 - df = DataFrame({"A": list("xy"), "B": [1, 2]}) - - mi = MultiIndex.from_arrays([list("xy"), [0, 0]], names=("A", None)) - expected = DataFrame({"index": [0, 1]}, index=mi) - - result = df.groupby("A")["B"].apply(lambda x: x.reset_index(drop=False)[["index"]]) - tm.assert_frame_equal(result, expected) - - def test_apply_series_yield_constant(df): result = df.groupby(["A", "B"])["C"].apply(len) assert result.index.names[:2] == ("A", "B") @@ -1032,18 +1021,6 @@ def test_groupby_apply_group_keys_warns(): tm.assert_series_equal(result, df["B"]) -@pytest.mark.xfail( - reason="BinGrouper and Grouper aren't consistent with NA key handling" -) -def test_resample_with_only_nat(self): - # https://github.com/pandas-dev/pandas/issues/35251 - pi = pd.PeriodIndex([pd.NaT] * 3, freq="S") - frame = DataFrame([2, 3, 5], index=pi) - - with tm.assert_produces_warning(None): - frame.resample("1s").mean() - - def test_apply_with_timezones_aware(): # GH: 27212 dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4a239b86ead26..6b104ff517c4b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -767,7 +767,7 @@ def test_cummin(dtypes_for_minmax): def test_cummin_max_all_nan_column(method, dtype): base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) base_df["B"] = base_df["B"].astype(dtype) - grouped = base_df.groupby("A", group_keys=False) + grouped = base_df.groupby("A") expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) result = getattr(grouped, method)() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index c6e4bec3f7b2c..2a5d1569a41ed 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -669,7 +669,7 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. 
df = DataFrame({1: [], 2: []}) - g = df.groupby(1, group_keys=False) + g = df.groupby(1) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) From f46c59a811e430d59691d91d1e89361aaf0f7a8b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 13 Nov 2021 23:17:38 -0500 Subject: [PATCH 62/67] Added test --- pandas/tests/groupby/test_apply.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4b7655cf1667c..2773f461a6d1e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1002,6 +1002,15 @@ def test_apply_result_type(group_keys, udf): assert series_result.index.nlevels == 1 +def test_result_order_group_keys_false(): + # GH 34998 + # apply result order should not depend on whether index is the same or just equal + df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) + result = df.groupby("A", group_keys=False).apply(lambda x: x) + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + tm.assert_frame_equal(result, expected) + + def test_groupby_apply_group_keys_warns(): df = DataFrame({"A": [0, 1, 1], "B": [1, 2, 3]}) msg = "Not prepending group keys to the result index" From 27ed908f929ee583e0b1f72c8a0c5129df3df2f8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 14 Nov 2021 08:34:27 -0500 Subject: [PATCH 63/67] type-hint fixups --- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/resample.py | 4 ++-- pandas/core/series.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e2c4fa9f08d70..bb42d0c4f8267 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7762,7 +7762,7 @@ def groupby( level: Level | None = None, as_index: bool = True, sort: bool = True, - group_keys: bool = no_default, + group_keys: bool | lib.NoDefault = no_default, squeeze: bool | lib.NoDefault = no_default, observed: bool = False, dropna: bool = True, @@ -10524,7 +10524,7 @@ def resample( level=None, origin: str | TimestampConvertibleTypes = "start_day", offset: TimedeltaConvertibleTypes | None = None, - group_keys: bool = no_default, + group_keys: bool | lib.NoDefault = no_default, ) -> Resampler: return super().resample( rule=rule, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dd97f294b3a0f..3539fac1e5785 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7818,7 +7818,7 @@ def resample( level=None, origin: str | TimestampConvertibleTypes = "start_day", offset: TimedeltaConvertibleTypes | None = None, - group_keys: bool_t = lib.no_default, + group_keys: bool_t | lib.NoDefault = lib.no_default, ) -> Resampler: """ Resample time-series data. 
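The ``bool | lib.NoDefault`` annotations introduced in this patch reflect that ``group_keys`` now defaults to a sentinel rather than a real boolean: only when the caller leaves it unset does the groupby/resample machinery fall back to the old value-dependent behavior and emit the ``FutureWarning`` (the ``self.group_keys is lib.no_default`` check in ``_python_apply_general``). A minimal, self-contained sketch of that sentinel pattern is shown below; it is illustrative only: ``groupby_like`` and the warning text are invented for the example, and pandas' real sentinel lives in ``pandas._libs.lib``.

.. code-block:: python

    # Simplified sketch of the "no default" sentinel pattern used for
    # ``group_keys``; not pandas' actual implementation.
    from __future__ import annotations

    import enum
    import warnings


    class _NoDefault(enum.Enum):
        no_default = "NO_DEFAULT"


    no_default = _NoDefault.no_default


    def groupby_like(group_keys: bool | _NoDefault = no_default) -> bool:
        # Explicit True/False is honored as-is; the sentinel means the user
        # did not choose, so keep the legacy default but warn that the
        # behavior of transform-like ``apply`` will change in the future.
        if group_keys is no_default:
            warnings.warn(
                "group_keys was not specified; falling back to the legacy "
                "behavior. Pass group_keys=True or group_keys=False to "
                "silence this warning.",
                FutureWarning,
            )
            return True  # legacy default for groupby
        return group_keys


    groupby_like(group_keys=True)   # no warning
    groupby_like(group_keys=False)  # no warning
    groupby_like()                  # warns: the choice was left to the library

Using a dedicated sentinel rather than ``None`` keeps "argument omitted" unambiguous, since ``None`` could in principle be a meaningful user-supplied value.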
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 73c6e616ca797..e94fd6c39a727 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -579,7 +579,7 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): axis: int grouper: ops.BaseGrouper - group_keys: bool + group_keys: bool | lib.NoDefault @final def __len__(self) -> int: @@ -845,7 +845,7 @@ def __init__( selection: IndexLabel | None = None, as_index: bool = True, sort: bool = True, - group_keys: bool = True, + group_keys: bool | lib.NoDefault = True, squeeze: bool = False, observed: bool = False, mutated: bool = False, @@ -3757,7 +3757,7 @@ def get_groupby( selection=None, as_index: bool = True, sort: bool = True, - group_keys: bool = True, + group_keys: bool | lib.NoDefault = True, squeeze: bool = False, observed: bool = False, mutated: bool = False, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3f05a88e4aa67..a5106e295c0ab 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -147,7 +147,7 @@ def __init__( axis: int = 0, kind=None, *, - group_keys=lib.no_default, + group_keys: bool | lib.NoDefault = lib.no_default, selection=None, **kwargs, ): @@ -1461,7 +1461,7 @@ def __init__( base: int | None = None, origin: str | TimestampConvertibleTypes = "start_day", offset: TimedeltaConvertibleTypes | None = None, - group_keys: bool | None = True, + group_keys: bool | lib.NoDefault = True, **kwargs, ): # Check for correctness of the keyword arguments which would diff --git a/pandas/core/series.py b/pandas/core/series.py index 9f0a27001f0b1..c20e8361113f3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1874,7 +1874,7 @@ def groupby( level=None, as_index: bool = True, sort: bool = True, - group_keys: bool = no_default, + group_keys: bool | lib.NoDefault = no_default, squeeze: bool | lib.NoDefault = no_default, observed: bool = False, dropna: bool = True, @@ -5338,7 +5338,7 @@ def resample( level=None, origin: str | TimestampConvertibleTypes = "start_day", offset: TimedeltaConvertibleTypes | None = None, - group_keys: bool = no_default, + group_keys: bool | lib.NoDefault = no_default, ) -> Resampler: return super().resample( rule=rule, From c84fa45188afc223558028bb394b90a0ff464007 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 4 Dec 2021 22:11:44 -0500 Subject: [PATCH 64/67] Doc fixups --- doc/source/whatsnew/v1.4.0.rst | 42 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9817c597dc62c..bf272601a6c63 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -196,7 +196,8 @@ to the index in the resample when :meth:`.Resampler.apply` is used. .. warning:: Not specifying the ``group_keys`` argument will retain the - previous behavior and emit a warning. In a future version + previous behavior and emit a warning if the result will change + by specifying ``group_keys=False``. In a future version of pandas, not specifying ``group_keys`` will default to the same behavior as ``group_keys=False``. @@ -212,25 +213,28 @@ to the index in the resample when :meth:`.Resampler.apply` is used. Previously, the resulting index would depend upon the values returned by ``apply``, as seen in the following example. -.. code-block:: python +.. 
code-block:: ipython - >>> # pandas 1.3 - >>> df.resample("D").apply(lambda x: x) - a - 2021-01-01 00:00:00 0 - 2021-01-01 08:00:00 1 - 2021-01-01 16:00:00 2 - 2021-01-02 00:00:00 3 - 2021-01-02 08:00:00 4 - 2021-01-02 16:00:00 5 - >>> df.resample("D").apply(lambda x: x.reset_index()) - index a - 2021-01-01 0 2021-01-01 00:00:00 0 - 1 2021-01-01 08:00:00 1 - 2 2021-01-01 16:00:00 2 - 2021-01-02 0 2021-01-02 00:00:00 3 - 1 2021-01-02 08:00:00 4 - 2 2021-01-02 16:00:00 5 + In [1]: # pandas 1.3 + In [2]: df.resample("D").apply(lambda x: x) + Out[2]: + a + 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 + + In [3]: df.resample("D").apply(lambda x: x.reset_index()) + Out[3]: + index a + 2021-01-01 0 2021-01-01 00:00:00 0 + 1 2021-01-01 08:00:00 1 + 2 2021-01-01 16:00:00 2 + 2021-01-02 0 2021-01-02 00:00:00 3 + 1 2021-01-02 08:00:00 4 + 2 2021-01-02 16:00:00 5 .. _whatsnew_140.enhancements.other: From 808efc462e5caca4745b224ccc7c63a2fe06e8eb Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Jan 2022 13:29:17 -0500 Subject: [PATCH 65/67] Move notes from 1.4 to 1.5, added deprecation note for .groupby(...).apply --- doc/source/user_guide/groupby.rst | 7 ++-- doc/source/whatsnew/v1.4.0.rst | 51 ----------------------- doc/source/whatsnew/v1.5.0.rst | 69 +++++++++++++++++++++++++++++-- pandas/core/generic.py | 8 ++-- pandas/core/shared_docs.py | 2 +- 5 files changed, 75 insertions(+), 62 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index abc9a0843eeb1..bc772b5dab66c 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1056,8 +1056,9 @@ for both ``aggregate`` and ``transform`` in many standard use cases. However, .. note:: - ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. - So depending on the path taken, and exactly what you are grouping. Thus the grouped column(s) may be included in + ``apply`` can act as a reducer, transformer, *or* filter function, depending + on exactly what is passed to it. It can depend on the passed function and + exactly what you are grouping. Thus the grouped column(s) may be included in the output as well as set the indices. .. ipython:: python @@ -1105,7 +1106,7 @@ Control grouped column(s) placement with ``group_keys`` not be added for like-indexed outputs. In the future this behavior will change to always respect ``group_keys``, which defaults to ``True``. - .. versionchanged:: 1.4.0 + .. versionchanged:: 1.5.0 To control whether the grouped column(s) are included in the indices, you can use the argument ``group_keys``. Compare diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2a392c3a33b7b..47a087d38d146 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -197,57 +197,6 @@ library to produce a tight representation of :class:`DataFrame` objects df df.to_dict(orient='tight') -.. _whatsnew_140.resample_group_keys: - -Control of index with ``group_keys`` in :meth:`DataFrame.resample` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The argument ``group_keys`` has been added to the method :meth:`DataFrame.resample`. -As with :meth:`DataFrame.groupby`, this argument controls the whether each group is added -to the index in the resample when :meth:`.Resampler.apply` is used. - -.. 
warning:: - Not specifying the ``group_keys`` argument will retain the - previous behavior and emit a warning if the result will change - by specifying ``group_keys=False``. In a future version - of pandas, not specifying ``group_keys`` will default to - the same behavior as ``group_keys=False``. - -.. ipython:: python - - df = pd.DataFrame( - {'a': range(6)}, - index=pd.date_range("2021-01-01", periods=6, freq="8H") - ) - df.resample("D", group_keys=True).apply(lambda x: x) - df.resample("D", group_keys=False).apply(lambda x: x) - -Previously, the resulting index would depend upon the values returned by ``apply``, -as seen in the following example. - -.. code-block:: ipython - - In [1]: # pandas 1.3 - In [2]: df.resample("D").apply(lambda x: x) - Out[2]: - a - 2021-01-01 00:00:00 0 - 2021-01-01 08:00:00 1 - 2021-01-01 16:00:00 2 - 2021-01-02 00:00:00 3 - 2021-01-02 08:00:00 4 - 2021-01-02 16:00:00 5 - - In [3]: df.resample("D").apply(lambda x: x.reset_index()) - Out[3]: - index a - 2021-01-01 0 2021-01-01 00:00:00 0 - 1 2021-01-01 08:00:00 1 - 2 2021-01-01 16:00:00 2 - 2021-01-02 0 2021-01-02 00:00:00 3 - 1 2021-01-02 08:00:00 4 - 2 2021-01-02 16:00:00 5 - .. _whatsnew_140.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 94edbdea8e72c..930835550da0b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -22,10 +22,56 @@ Styler - New method :meth:`.Styler.to_string` for alternative customisable output methods (:issue:`44502`) - Various bug fixes, see below. -.. _whatsnew_150.enhancements.enhancement2: +.. _whatsnew_150.enhancements.resample_group_keys: -enhancement2 -^^^^^^^^^^^^ +Control of index with ``group_keys`` in :meth:`DataFrame.resample` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The argument ``group_keys`` has been added to the method :meth:`DataFrame.resample`. +As with :meth:`DataFrame.groupby`, this argument controls the whether each group is added +to the index in the resample when :meth:`.Resampler.apply` is used. + +.. warning:: + Not specifying the ``group_keys`` argument will retain the + previous behavior and emit a warning if the result will change + by specifying ``group_keys=False``. In a future version + of pandas, not specifying ``group_keys`` will default to + the same behavior as ``group_keys=False``. + +.. ipython:: python + + df = pd.DataFrame( + {'a': range(6)}, + index=pd.date_range("2021-01-01", periods=6, freq="8H") + ) + df.resample("D", group_keys=True).apply(lambda x: x) + df.resample("D", group_keys=False).apply(lambda x: x) + +Previously, the resulting index would depend upon the values returned by ``apply``, +as seen in the following example. + +.. code-block:: ipython + + In [1]: # pandas 1.3 + In [2]: df.resample("D").apply(lambda x: x) + Out[2]: + a + 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 + + In [3]: df.resample("D").apply(lambda x: x.reset_index()) + Out[3]: + index a + 2021-01-01 0 2021-01-01 00:00:00 0 + 1 2021-01-01 08:00:00 1 + 2 2021-01-01 16:00:00 2 + 2021-01-02 0 2021-01-02 00:00:00 3 + 1 2021-01-02 08:00:00 4 + 2 2021-01-02 16:00:00 5 .. _whatsnew_150.enhancements.other: @@ -140,6 +186,23 @@ use ``series.loc[i:j]``. Slicing on a :class:`DataFrame` will not be affected. +.. 
_whatsnew_150.deprecations.group_keys_in_apply: + +Using ``group_keys`` with transformers in :meth:`.GroupBy.apply` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, if it was inferred that the function passed to +:meth:`.GroupBy.apply` was a transformer (i.e. the resulting index was equal to +the input index), the ``group_keys`` argument of :meth:`DataFrame.groupby` and +:meth:`Series.groupby` was ignored and the group keys would never be added to +the index of the result. In the future, the group keys will be added to the index +when the user specifies ``group_keys=True``. + +As ``group_keys=True`` is the default value of :meth:`DataFrame.groupby` and +:meth:`Series.groupby`, not specifying ``group_keys`` with a transformer will +raise a ``FutureWarning``. This can be silenced and the previous behavior +retained by specifying ``group_keys=False``. + .. _whatsnew_150.deprecations.other: Other Deprecations diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8ae5b3afadf12..8b78b767f33e5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7992,13 +7992,13 @@ def resample( group_keys : bool, optional Whether to include the group keys in the result index when using ``.apply()`` on the resampled object. Not specifying ``group_keys`` - will retain values-dependent behavior from pandas 1.3 - and earlier (see - :ref:`pandas 1.4.0 Release notes` + will retain values-dependent behavior from pandas 1.4 + and earlier (see :ref:`pandas 1.5.0 Release notes + ` for examples). In a future version of pandas, the behavior will default to the same as specifying ``group_keys=False``. - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 Returns ------- diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 21dd79585f5d1..61a33722ff86c 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -118,7 +118,7 @@ By default group keys are not included when the result's index (and column) labels match the inputs, and are included otherwise. - .. versionchanged:: 1.4.0 + .. versionchanged:: 1.5.0 Warns that `group_keys` will no longer be ignored when the result from ``apply`` is a like-indexed Series or DataFrame. From 215d9a8b3bf2603855640ed9c076bc2ca08f6cdc Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 22 Jan 2022 16:10:54 -0500 Subject: [PATCH 66/67] Avoid warnings, is_empty_agg -> is_agg --- doc/source/whatsnew/v1.4.0.rst | 16 +++++++++++++--- pandas/core/groupby/groupby.py | 17 +++++++++++------ pandas/tests/generic/test_finalize.py | 2 +- .../tests/groupby/aggregate/test_aggregate.py | 6 +++--- pandas/tests/groupby/test_apply.py | 10 +++++----- pandas/tests/groupby/test_groupby.py | 4 ++-- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/resample/test_base.py | 2 +- pandas/tests/window/test_groupby.py | 18 +++++++++--------- 9 files changed, 46 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 47a087d38d146..f8c813e615d37 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -455,10 +455,20 @@ result's index is not the same as the input's. *New behavior*: -.. ipython:: python +.. 
code-block:: ipython - df.groupby(['a']).apply(func) - df.set_index(['a', 'b']).groupby(['a']).apply(func) + In [5]: df.groupby(['a']).apply(func) + Out[5]: + a b c + 0 1 3 5 + 1 2 4 6 + + In [6]: df.set_index(['a', 'b']).groupby(['a']).apply(func) + Out[6]: + c + a b + 1 3 5 + 2 4 6 Now in both cases it is determined that ``func`` is a transform. In each case, the result has the same index as the input. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a079e5ff949b5..3c39442fe15a0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1448,7 +1448,7 @@ def _python_apply_general( data: DataFrame | Series, not_indexed_same: bool | None = None, is_transform: bool = False, - is_empty_agg: bool = False, + is_agg: bool = False, ) -> DataFrame | Series: """ Apply function f in python space @@ -1468,9 +1468,9 @@ def _python_apply_general( and should not have group keys prepended. This is used in _make_wrapper which generates both transforms (e.g. diff) and non-transforms (e.g. corr) - is_empty_agg : bool, default False - Indicator for whether the function is actually an aggregation - on an empty result. We don't want to warn for this case. + is_agg : bool, default False + Indicator for whether the function is an aggregation. When the + result is empty, we don't want to warn for this case. See _GroupBy._python_agg_general. Returns @@ -1483,6 +1483,7 @@ def _python_apply_general( not_indexed_same = mutated or self.mutated override_group_keys = False + is_empty_agg = is_agg and len(values) == 0 if (not not_indexed_same and self.group_keys is lib.no_default) and not ( is_transform or is_empty_agg ): @@ -1523,7 +1524,7 @@ def _python_agg_general(self, func, *args, **kwargs): if self.ngroups == 0: # agg_series below assumes ngroups > 0 - return self._python_apply_general(f, self._selected_obj) + return self._python_apply_general(f, self._selected_obj, is_agg=True) for idx, obj in enumerate(self._iterate_slices()): name = obj.name @@ -3270,7 +3271,11 @@ def rank( if axis != 0: # DataFrame uses different keyword name kwargs["method"] = kwargs.pop("ties_method") - return self.apply(lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)) + f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) + result = self._python_apply_general( + f, self._selected_obj, is_transform=True + ) + return result return self._cython_transform( "rank", diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 403e5c6c7daf7..54111dda66ac3 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -754,7 +754,7 @@ def test_categorical_accessor(method): ) def test_groupby_finalize(obj, method): obj.attrs = {"a": 1} - result = method(obj.groupby([0, 0])) + result = method(obj.groupby([0, 0], group_keys=False)) assert result.attrs == {"a": 1} diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2ab553434873c..76a9fe4d2f982 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -133,7 +133,7 @@ def test_groupby_aggregation_multi_level_column(): def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA - grouped = ts.groupby(ts * np.nan) + grouped = ts.groupby(ts * np.nan, group_keys=False) assert ts.dtype == np.float64 # groupby float64 values results in Float64Index @@ -143,7 +143,7 @@ def test_agg_apply_corner(ts, tsframe): 
tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame - grouped = tsframe.groupby(tsframe["A"] * np.nan) + grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False) exp_df = DataFrame( columns=tsframe.columns, dtype=float, @@ -914,7 +914,7 @@ def test_groupby_aggregate_empty_key_empty_return(): def test_groupby_aggregate_empty_with_multiindex_frame(): # GH 39178 df = DataFrame(columns=["a", "b", "c"]) - result = df.groupby(["a", "b"]).agg(d=("c", list)) + result = df.groupby(["a", "b"], group_keys=False).agg(d=("c", list)) expected = DataFrame( columns=["d"], index=MultiIndex([[], []], [[], []], names=["a", "b"]) ) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 32315985cc129..b45592833f1aa 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -575,7 +575,7 @@ def test_apply_corner_cases(): } ) - grouped = df.groupby("key") + grouped = df.groupby("key", group_keys=False) def f(g): g["value3"] = g["value1"] * 2 @@ -1189,9 +1189,9 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - result = expected.groupby([expected.let, expected.date.dt.date]).apply( - lambda x: x.iloc[0:] - ) + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) @@ -1306,5 +1306,5 @@ def test_apply_index_key_error_bug(index_values): def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) - result = expected.groupby("col").apply(lambda x: x) + result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 20ff811e64d3b..516e5fb7463de 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1920,7 +1920,7 @@ def test_empty_groupby(columns, keys, values, method, op, request, using_array_m df = df.iloc[:0] - gb = df.groupby(keys)[columns] + gb = df.groupby(keys, group_keys=False)[columns] def get_result(): if method == "attr": @@ -2041,7 +2041,7 @@ def test_empty_groupby_apply_nonunique_columns(): df = DataFrame(np.random.randn(0, 4)) df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] - gb = df.groupby(df[1]) + gb = df.groupby(df[1], group_keys=False) res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 2a5d1569a41ed..c6e4bec3f7b2c 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -669,7 +669,7 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. 
df = DataFrame({1: [], 2: []}) - g = df.groupby(1) + g = df.groupby(1, group_keys=False) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index afc020db4bb57..efc7eeec4fb04 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -228,7 +228,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 ser = empty_series_dti - result = ser.resample(freq).apply(lambda x: 1) + result = ser.resample(freq, group_keys=False).apply(lambda x: 1) expected = ser.resample(freq).apply(np.sum) tm.assert_series_equal(result, expected, check_dtype=False) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index c69a3a4b4cc06..a2a2ab4438c00 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -78,7 +78,7 @@ def test_getitem_multiple(self): ], ) def test_rolling(self, f): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.rolling(window=4) result = getattr(r, f)() @@ -92,7 +92,7 @@ def test_rolling(self, f): @pytest.mark.parametrize("f", ["std", "var"]) def test_rolling_ddof(self, f): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.rolling(window=4) result = getattr(r, f)(ddof=1) @@ -108,7 +108,7 @@ def test_rolling_ddof(self, f): "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] ) def test_rolling_quantile(self, interpolation): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) @@ -213,7 +213,7 @@ def test_rolling_corr_cov_unordered(self, func, expected_values): tm.assert_frame_equal(result, expected) def test_rolling_apply(self, raw): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.rolling(window=4) # reduction @@ -755,7 +755,7 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self): # GH 39732 - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) result = g.apply(lambda x: x.rolling(4).sum()).index @@ -904,7 +904,7 @@ def setup_method(self): "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] ) def test_expanding(self, f): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)() @@ -918,7 +918,7 @@ def test_expanding(self, f): @pytest.mark.parametrize("f", ["std", "var"]) def test_expanding_ddof(self, f): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.expanding() result = getattr(r, f)(ddof=0) @@ -934,7 +934,7 @@ def test_expanding_ddof(self, f): "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] ) def test_expanding_quantile(self, interpolation): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) @@ -977,7 +977,7 @@ def func(x): tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw): - g = self.frame.groupby("A") + g = self.frame.groupby("A", group_keys=False) r = g.expanding() # reduction From 14b9b0f7b7cd98921beeaf1e04b651cf62ef8c22 Mon Sep 17 00:00:00 2001 From: 
Richard Shadrach Date: Tue, 8 Feb 2022 18:21:23 -0500 Subject: [PATCH 67/67] Suppress warning in tests --- pandas/tests/groupby/test_apply_mutate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 01fe7512c0fe9..36e117cf03353 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -13,8 +13,10 @@ def test_group_by_copy(): } ).set_index("name") - grp_by_same_value = df.groupby(["age"]).apply(lambda group: group) - grp_by_copy = df.groupby(["age"]).apply(lambda group: group.copy()) + grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy)
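Taken together, the series makes ``group_keys`` the single switch controlling whether ``apply`` prepends the group labels to the result index, for both ``groupby`` and ``resample``. The sketch below summarizes the user-facing behavior described in the whatsnew entries above; it assumes a pandas build that includes these patches (1.5.0-style semantics), and the expected results noted in the comments are taken from those entries.

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame(
        {"a": range(6)},
        index=pd.date_range("2021-01-01", periods=6, freq="8H"),
    )

    # groupby: ``group_keys`` is now respected by ``apply`` even when the
    # UDF is a transform (returns a like-indexed object).
    kept = df.groupby(df.index.date, group_keys=True).apply(lambda x: x)
    dropped = df.groupby(df.index.date, group_keys=False).apply(lambda x: x)
    print(kept.index.nlevels)              # 2: group label prepended
    print(dropped.index.equals(df.index))  # True: original index preserved

    # resample: the same switch, newly exposed by this series.
    r_kept = df.resample("D", group_keys=True).apply(lambda x: x)
    r_dropped = df.resample("D", group_keys=False).apply(lambda x: x)
    print(r_kept.index.nlevels)              # 2: day bin prepended
    print(r_dropped.index.equals(df.index))  # True

    # Leaving ``group_keys`` unspecified keeps the previous, value-dependent
    # result but emits a FutureWarning whenever the outcome would change
    # under the new default.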