From e84a15dfa19a5130c99825c16f6f2f350ce36b4a Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 15 Jul 2020 00:58:41 +0100 Subject: [PATCH 01/34] GroupBy.apply() calls self._reset_group_selection at the start. Errant tests updated --- pandas/core/groupby/groupby.py | 2 ++ pandas/tests/groupby/test_categorical.py | 4 ++-- pandas/tests/groupby/test_grouping.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..f570d3284e3f5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -825,6 +825,8 @@ def __iter__(self): ) ) def apply(self, func, *args, **kwargs): + + self._reset_group_selection() func = self._is_builtin_func(func) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..96a5e83459a59 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -129,9 +129,9 @@ def test_basic(): def f(x): return x.drop_duplicates("person_name").iloc[0] + g = x.groupby(["person_id"], observed=False, as_index=False) result = g.apply(f) expected = x.iloc[[0, 1]].copy() - expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") tm.assert_frame_equal(result, expected) @@ -1287,7 +1287,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( + df.groupby("var", as_index=False).apply( lambda rows: pd.DataFrame( {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index efcd22f9c0c82..61588bd6ce165 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -191,7 +191,7 @@ def 
test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + result = g[["B"]].apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) g = df.groupby(pd.Grouper(key="A", axis=0)) From e122809eae5c8d55c593541c537427a943270680 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 15 Jul 2020 15:39:33 +0100 Subject: [PATCH 02/34] gb.apply() now resets group selection so it always returns grouping columns as columns. updated tests that relied on previous behaviour --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/aggregate/test_other.py | 4 ++-- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/transform/test_transform.py | 8 +++----- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 972ece1ab549f..a29502ce1363d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -828,7 +828,7 @@ def __iter__(self): ) ) def apply(self, func, *args, **kwargs): - + self._reset_group_selection() func = self._is_builtin_func(func) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 264cf40dc6984..3356c2dd9c88b 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -486,13 +486,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[0]).loc["a", "B"] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[-1]).loc["a", "B"] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_function.py 
b/pandas/tests/groupby/test_function.py index 6f19ec40c2520..35c9bd4c33fdc 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -298,7 +298,7 @@ def test_non_cython_api(): index=expected_index, columns=expected_col, ) - result = g.describe() + result = g.describe().drop(columns="A") tm.assert_frame_equal(result, expected) expected = pd.concat( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cdaf27e214d80..4caf50b0df1fa 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -728,11 +728,6 @@ def test_cython_transform_frame(op, args, targop): # dict(by=['int','string'])]: gb = df.groupby(**gb_target) - # allowlisted methods set the selection before applying - # bit a of hack to make sure the cythonized shift - # is equivalent to pre 0.17.1 behavior - if op == "shift": - gb._set_group_selection() if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have @@ -743,6 +738,9 @@ def test_cython_transform_frame(op, args, targop): else: expected = gb.apply(targop) + if op == "shift" and type(gb_target.get("by")) is str: + expected = expected.drop(columns=gb_target.get("by")) + expected = expected.sort_index(axis=1) tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) From 27b2694203057c1e3691a667d4337bc363e1744c Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 15 Jul 2020 15:46:41 +0100 Subject: [PATCH 03/34] test uses .drop() instead of selection --- pandas/tests/groupby/test_grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 61588bd6ce165..9122b8de4c2fa 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ 
-191,7 +191,7 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g[["B"]].apply(lambda x: x.sum()) + result = g.apply(lambda x: x.sum()).drop(columns="A") tm.assert_frame_equal(result, expected) g = df.groupby(pd.Grouper(key="A", axis=0)) From 0cca6df39246f06734d820b13b78d36581aec5b4 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 15 Jul 2020 16:44:15 +0100 Subject: [PATCH 04/34] wrote new tests --- pandas/tests/groupby/test_apply.py | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index aa10f44670361..a6aa9bdd7814d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1010,3 +1010,43 @@ def test_apply_with_timezones_aware(): result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) tm.assert_frame_equal(result1, result2) + + +@pytest.mark.parametrize( + "func", ["sum", "min", "max", "mean", "std", "prod", "cumprod", "cumsum"] +) +def test_apply_is_unchanged_when_other_methods_are_clled_first(func): + # GH 34656 + # GH 34271 + df = DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + } + ) + + expected = df.groupby("a").apply(getattr(np, func)) + + # Call .apply() without calling any method on the GroupBy beforehand + grp = df.groupby("a") + result = grp.apply(getattr(np, func)) + tm.assert_frame_equal(result, expected) + + # Call .apply() after calling .min() on the GroupBy + grp = df.groupby("a") + grp.min() + result = grp.apply(getattr(np, func)) + tm.assert_frame_equal(result, expected) + + # Call .apply() after calling 'func' on the GroupBy + grp = df.groupby("a") + getattr(grp, func)() + result = grp.apply(getattr(np, func)) + tm.assert_frame_equal(result, expected) + + # Call .apply() after directly calling ._set_group_selection() on the GroupBy + grp = df.groupby("a") + 
grp._set_group_selection() + result = grp.apply(getattr(np, func)) + tm.assert_frame_equal(result, expected) From 2786eb5e3b2f0ee46659a4c56e4cd29be873e703 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Thu, 16 Jul 2020 23:51:42 +0100 Subject: [PATCH 05/34] rewrote test --- pandas/tests/groupby/test_apply.py | 37 +++++++++++------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a6aa9bdd7814d..b9e800c6da84d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1012,10 +1012,7 @@ def test_apply_with_timezones_aware(): tm.assert_frame_equal(result1, result2) -@pytest.mark.parametrize( - "func", ["sum", "min", "max", "mean", "std", "prod", "cumprod", "cumsum"] -) -def test_apply_is_unchanged_when_other_methods_are_clled_first(func): +def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # GH 34656 # GH 34271 df = DataFrame( @@ -1026,27 +1023,19 @@ def test_apply_is_unchanged_when_other_methods_are_clled_first(func): } ) - expected = df.groupby("a").apply(getattr(np, func)) - - # Call .apply() without calling any method on the GroupBy beforehand - grp = df.groupby("a") - result = grp.apply(getattr(np, func)) - tm.assert_frame_equal(result, expected) - - # Call .apply() after calling .min() on the GroupBy - grp = df.groupby("a") - grp.min() - result = grp.apply(getattr(np, func)) - tm.assert_frame_equal(result, expected) + expected = pd.DataFrame( + {"a": [264, 297], "b": [15, 6], "c": [150, 60],}, + index=pd.Index([88, 99], name="a"), + ) - # Call .apply() after calling 'func' on the GroupBy - grp = df.groupby("a") - getattr(grp, func)() - result = grp.apply(getattr(np, func)) + # Check output wehn no other methods are called before .apply() + grp = df.groupby(by="a") + result = grp.apply(sum) tm.assert_frame_equal(result, expected) - # Call .apply() after directly calling ._set_group_selection() on the 
GroupBy - grp = df.groupby("a") - grp._set_group_selection() - result = grp.apply(getattr(np, func)) + # Check output when another methods is called before .apply() + grp = df.groupby(by="a") + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + _ = getattr(grp, reduction_func)(*args) + result = grp.apply(sum) tm.assert_frame_equal(result, expected) From 33cdf65b60bcdf67d7bb1b4b70e3c42c9ae2d675 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Thu, 16 Jul 2020 23:54:24 +0100 Subject: [PATCH 06/34] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 814dbe999d5c1..bb11085c26d1f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1084,6 +1084,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance index column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping ^^^^^^^^^ From 6a65e2f4eb64e9f2811d4d2742f0724596c7b658 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:03:29 +0100 Subject: [PATCH 07/34] restore if-stat in test_transform --- pandas/tests/groupby/transform/test_transform.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 4caf50b0df1fa..47e5fe2742075 100644 --- 
a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -728,6 +728,11 @@ def test_cython_transform_frame(op, args, targop): # dict(by=['int','string'])]: gb = df.groupby(**gb_target) + # allowlisted methods set the selection before applying + # bit a of hack to make sure the cythonized shift + # is equivalent to pre 0.17.1 behavior + if op == "shift": + gb._set_group_selection() if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have From 9948d2f898be43af4e985b4da070b31749147baa Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:06:57 +0100 Subject: [PATCH 08/34] amended test --- pandas/tests/groupby/test_categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0d654dc60e9c..e04f7877a352b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -118,7 +118,7 @@ def test_basic(): ) x["person_name"] = Categorical(x.person_name) - g = x.groupby(["person_id"], observed=False) + g = x.groupby(["person_id"], observed=False, as_index=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[["person_name"]]) @@ -129,7 +129,6 @@ def test_basic(): def f(x): return x.drop_duplicates("person_name").iloc[0] - g = x.groupby(["person_id"], observed=False, as_index=False) result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected["person_name"] = expected["person_name"].astype("object") From e4a132e88b809859736d9131bbcebf4b17df477c Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:09:40 +0100 Subject: [PATCH 09/34] restored test --- pandas/tests/groupby/test_categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e04f7877a352b..52ef4fe935bdb 100644 --- 
a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -118,7 +118,7 @@ def test_basic(): ) x["person_name"] = Categorical(x.person_name) - g = x.groupby(["person_id"], observed=False, as_index=False) + g = x.groupby(["person_id"], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[["person_name"]]) @@ -131,6 +131,7 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() + expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") tm.assert_frame_equal(result, expected) From 4170ca6969fa9d766597495cb6de279e7c26ec76 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:11:08 +0100 Subject: [PATCH 10/34] restored test --- pandas/tests/groupby/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 52ef4fe935bdb..7e4513da37dc9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1287,7 +1287,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var", as_index=False).apply( + df.groupby("var").apply( lambda rows: pd.DataFrame( {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) From 0ab1c8ae77f0a89adc38a334e819994ccd987667 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:14:29 +0100 Subject: [PATCH 11/34] amended test --- pandas/tests/groupby/aggregate/test_other.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 3356c2dd9c88b..e8cd6017a117c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ 
b/pandas/tests/groupby/aggregate/test_other.py @@ -486,13 +486,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).loc["a", "B"] + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).loc["a", "B"] + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): From f2a32f405c4a52c28827c516f34139b728541c8e Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:16:31 +0100 Subject: [PATCH 12/34] cleanup --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/tests/groupby/test_apply.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bb11085c26d1f..6f727df2cb1c4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1084,7 +1084,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) -- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance index column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` 
(:issue:`34656`) Reshaping ^^^^^^^^^ diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index b9e800c6da84d..0e9cb845dcf63 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1033,7 +1033,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): result = grp.apply(sum) tm.assert_frame_equal(result, expected) - # Check output when another methods is called before .apply() + # Check output when another method is called before .apply() grp = df.groupby(by="a") args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) _ = getattr(grp, reduction_func)(*args) From f5b674b4580e5cfb3e778659a48c6b2560fd55af Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 00:29:14 +0100 Subject: [PATCH 13/34] trailing comma --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0e9cb845dcf63..b68b3d74517bc 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1024,7 +1024,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): ) expected = pd.DataFrame( - {"a": [264, 297], "b": [15, 6], "c": [150, 60],}, + {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, index=pd.Index([88, 99], name="a"), ) From 7028756e5d93adf200822a955fbeb0e3090d57c7 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 09:14:15 +0100 Subject: [PATCH 14/34] fixed test_to_latex --- pandas/tests/io/formats/test_to_latex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 509e5bcb33304..c7b2d2de2585d 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -224,7 +224,7 @@ def test_to_latex_multiindex(self): assert result == expected - result = 
df.groupby("a").describe().to_latex() + result = df.groupby("a").describe().drop(columns='a').to_latex() expected = r"""\begin{tabular}{lrrrrrrrr} \toprule {} & \multicolumn{8}{l}{c} \\ From 45abe63d7810a49312d60355bbc81d805280b18d Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 10:14:20 +0100 Subject: [PATCH 15/34] added test to ensure .describe() keeps non-nuisance groupin columns --- pandas/core/groupby/groupby.py | 9 +++--- pandas/tests/groupby/test_function.py | 44 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a29502ce1363d..c542c1ae6e334 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1625,11 +1625,10 @@ def ohlc(self) -> DataFrame: @doc(DataFrame.describe) def describe(self, **kwargs): - with _group_selection_context(self): - result = self.apply(lambda x: x.describe(**kwargs)) - if self.axis == 1: - return result.T - return result.unstack() + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() def resample(self, rule, *args, **kwargs): """ diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 35c9bd4c33fdc..3a994f9265c8a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -974,6 +974,50 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("by_col_dtype", [int, float, str]) +def test_describe_results_includes_non_nuisance_columns(by_col_dtype): + # GH 34656 + # GH 34271 + df = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 3, 3], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + df = df.astype({"a": by_col_dtype}) + + expected = ( + DataFrame.from_records( + [ + ("a", "count", 3.0, 3.0, 3.0), + ("a", "mean", 1.0, 2.0, 3.0), + ("a", "std", 0.0, 0.0, 0.0), + ("a", "min", 1.0, 2.0, 3.0), + ("a", "25%", 1.0, 
2.0, 3.0), + ("a", "50%", 1.0, 2.0, 3.0), + ("a", "75%", 1.0, 2.0, 3.0), + ("a", "max", 1.0, 2.0, 3.0), + ("b", "count", 3.0, 3.0, 3.0), + ("b", "mean", 2.0, 5.0, 8.0), + ("b", "std", 1.0, 1.0, 1.0), + ("b", "min", 1.0, 4.0, 7.0), + ("b", "25%", 1.5, 4.5, 7.5), + ("b", "50%", 2.0, 5.0, 8.0), + ("b", "75%", 2.5, 5.5, 8.5), + ("b", "max", 3.0, 6.0, 9.0), + ], + columns=["col", "func", 1, 2, 3], + ) + .set_index(["col", "func"]) + .T + ) + expected.columns.names = [None, None] + expected.index = pd.Index(expected.index.astype(by_col_dtype), name="a") + + if by_col_dtype is str: + # If the grouping column is a nuisance column (i.e. can't apply the + # std() or quantile() to it) then it does not appear in the output + expected = expected.drop(columns="a") + + result = df.groupby("a").describe() + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = pd.DataFrame( From 063f0ea0320313ad3f553410a88aed9a6adc9ce6 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 11:03:01 +0100 Subject: [PATCH 16/34] minimized changes to exsiting tests --- pandas/tests/groupby/test_function.py | 45 ++++++++++++++++++++---- pandas/tests/groupby/test_grouping.py | 8 +++-- pandas/tests/io/formats/test_to_latex.py | 2 +- 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3a994f9265c8a..61d8ae3b281b8 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -286,19 +286,52 @@ def test_non_cython_api(): # describe expected_index = pd.Index([1, 3], name="A") - expected_col = pd.MultiIndex( - levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], - codes=[[0] * 8, list(range(8))], + expected_col = pd.MultiIndex.from_product( + [["A", "B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]] ) expected = pd.DataFrame( [ - [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 
2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [ + 2.0, + 1.0, + 0.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 2.0, + np.nan, + 2.0, + 2.0, + 2.0, + 2.0, + 2.0, + ], + [ + 1.0, + 3.0, + np.nan, + 3.0, + 3.0, + 3.0, + 3.0, + 3.0, + 0.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], ], index=expected_index, columns=expected_col, ) - result = g.describe().drop(columns="A") + result = g.describe() tm.assert_frame_equal(result, expected) expected = pd.concat( diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 9122b8de4c2fa..40b4ce46e550b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -191,13 +191,15 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()).drop(columns="A") - tm.assert_frame_equal(result, expected) - g = df.groupby(pd.Grouper(key="A", axis=0)) result = g.sum() tm.assert_frame_equal(result, expected) + result = g.apply(lambda x: x.sum()) + expected["A"] = [0, 2, 4] + expected = expected.loc[:, ["A", "B"]] + tm.assert_frame_equal(result, expected) + # GH14334 # pd.Grouper(key=...) 
may be passed in a list df = DataFrame( diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index c7b2d2de2585d..053a9b6a9fd38 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -224,7 +224,7 @@ def test_to_latex_multiindex(self): assert result == expected - result = df.groupby("a").describe().drop(columns='a').to_latex() + result = df.groupby("a").describe().drop(columns="a").to_latex() expected = r"""\begin{tabular}{lrrrrrrrr} \toprule {} & \multicolumn{8}{l}{c} \\ From 7f0d192aa172f6fba3d22f2eb036e458a757a222 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 11:04:57 +0100 Subject: [PATCH 17/34] add .describe() to whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6f727df2cb1c4..6c5810a770ece 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1084,7 +1084,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) -- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupBy.apply` :meth:`DataFrameGroupBy.describe` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) 
Reshaping ^^^^^^^^^ From 974da63b1fc4c7bdd53bd45bb8227cea69bcb3d4 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 17 Jul 2020 20:58:47 +0100 Subject: [PATCH 18/34] parametrize test over as_index=T/F --- pandas/tests/groupby/test_function.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 61d8ae3b281b8..595cbe9a0e9de 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1008,7 +1008,8 @@ def test_frame_describe_unstacked_format(): @pytest.mark.parametrize("by_col_dtype", [int, float, str]) -def test_describe_results_includes_non_nuisance_columns(by_col_dtype): +@pytest.mark.parametrize("as_index", [True, False]) +def test_describe_results_includes_non_nuisance_columns(by_col_dtype, as_index): # GH 34656 # GH 34271 df = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 3, 3], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) @@ -1042,12 +1043,15 @@ def test_describe_results_includes_non_nuisance_columns(by_col_dtype): expected.columns.names = [None, None] expected.index = pd.Index(expected.index.astype(by_col_dtype), name="a") + if not as_index: + expected = expected.reset_index(drop=True) + if by_col_dtype is str: # If the grouping column is a nuisance column (i.e. 
can't apply the # std() or quantile() to it) then it does not appear in the output expected = expected.drop(columns="a") - result = df.groupby("a").describe() + result = df.groupby("a", as_index=as_index).describe() tm.assert_frame_equal(result, expected) From b395e39e72c5c550f78bf3929829d9b39bb47cce Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 18 Jul 2020 13:41:49 +0100 Subject: [PATCH 19/34] restored .describe to old behaviour --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/groupby/groupby.py | 5 + pandas/tests/groupby/test_function.py | 160 +---------------------- pandas/tests/io/formats/test_to_latex.py | 2 +- 4 files changed, 12 insertions(+), 157 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6c5810a770ece..6f727df2cb1c4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1084,7 +1084,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) -- Bug in :meth:`DataFrameGroupBy.apply` :meth:`DataFrameGroupBy.describe` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c542c1ae6e334..b44a563bdb648 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1628,6 +1628,11 @@ def describe(self, **kwargs): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T + if self.as_index: + with _group_selection_context(self): + if self._group_selection is not None: + cols = result.columns.intersection(self._group_selection) + result = result.reindex(columns=cols) return result.unstack() def resample(self, rule, *args, **kwargs): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 595cbe9a0e9de..d136556e5336a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -286,47 +286,14 @@ def test_non_cython_api(): # describe expected_index = pd.Index([1, 3], name="A") - expected_col = pd.MultiIndex.from_product( - [["A", "B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]] + expected_col = pd.MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], ) expected = pd.DataFrame( [ - [ - 2.0, - 1.0, - 0.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 2.0, - np.nan, - 2.0, - 2.0, - 2.0, - 2.0, - 2.0, - ], - [ - 1.0, - 3.0, - np.nan, - 3.0, - 3.0, - 3.0, - 3.0, - 3.0, - 0.0, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ], + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], ], index=expected_index, columns=expected_col, @@ -334,16 +301,6 @@ def test_non_cython_api(): result = g.describe() tm.assert_frame_equal(result, expected) - expected = pd.concat( - [ - df[df.A == 1].describe().unstack().to_frame().T, - df[df.A == 3].describe().unstack().to_frame().T, - ] - ) - expected.index = pd.Index([0, 1]) - result = gni.describe() - tm.assert_frame_equal(result, expected) - # any expected = DataFrame( [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] @@ -1005,110 +962,3 @@ def 
test_frame_describe_unstacked_format(): columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("by_col_dtype", [int, float, str]) -@pytest.mark.parametrize("as_index", [True, False]) -def test_describe_results_includes_non_nuisance_columns(by_col_dtype, as_index): - # GH 34656 - # GH 34271 - df = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 3, 3], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - df = df.astype({"a": by_col_dtype}) - - expected = ( - DataFrame.from_records( - [ - ("a", "count", 3.0, 3.0, 3.0), - ("a", "mean", 1.0, 2.0, 3.0), - ("a", "std", 0.0, 0.0, 0.0), - ("a", "min", 1.0, 2.0, 3.0), - ("a", "25%", 1.0, 2.0, 3.0), - ("a", "50%", 1.0, 2.0, 3.0), - ("a", "75%", 1.0, 2.0, 3.0), - ("a", "max", 1.0, 2.0, 3.0), - ("b", "count", 3.0, 3.0, 3.0), - ("b", "mean", 2.0, 5.0, 8.0), - ("b", "std", 1.0, 1.0, 1.0), - ("b", "min", 1.0, 4.0, 7.0), - ("b", "25%", 1.5, 4.5, 7.5), - ("b", "50%", 2.0, 5.0, 8.0), - ("b", "75%", 2.5, 5.5, 8.5), - ("b", "max", 3.0, 6.0, 9.0), - ], - columns=["col", "func", 1, 2, 3], - ) - .set_index(["col", "func"]) - .T - ) - expected.columns.names = [None, None] - expected.index = pd.Index(expected.index.astype(by_col_dtype), name="a") - - if not as_index: - expected = expected.reset_index(drop=True) - - if by_col_dtype is str: - # If the grouping column is a nuisance column (i.e. 
can't apply the - # std() or quantile() to it) then it does not appear in the output - expected = expected.drop(columns="a") - - result = df.groupby("a", as_index=as_index).describe() - tm.assert_frame_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = pd.DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - -@pytest.mark.parametrize( - "values", - [ - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], - }, - {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, - ], -) -@pytest.mark.parametrize("function", ["mean", "median", "var"]) -def test_apply_to_nullable_integer_returns_float(values, function): - # https://github.com/pandas-dev/pandas/issues/32219 - output = 0.5 if function == "var" else 1.5 - arr = np.array([output] * 3, dtype=float) - idx = pd.Index([1, 2, 3], dtype=object, name="a") - expected = pd.DataFrame({"b": arr}, index=idx) - - groups = pd.DataFrame(values, dtype="Int64").groupby("a") - - result = getattr(groups, function)() - tm.assert_frame_equal(result, expected) - - result = groups.agg(function) - tm.assert_frame_equal(result, expected) - - result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) - tm.assert_frame_equal(result, expected) - - -def test_groupby_sum_below_mincount_nullable_integer(): - # https://github.com/pandas-dev/pandas/issues/32861 - df = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") - grouped = df.groupby("a") - idx = pd.Index([0, 1, 2], dtype=object, name="a") - - result = grouped["b"].sum(min_count=2) - expected = pd.Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") - tm.assert_series_equal(result, expected) - - result = grouped.sum(min_count=2) - expected = pd.DataFrame( - {"b": [pd.NA] * 3, "c": [pd.NA] * 3}, 
dtype="Int64", index=idx - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 053a9b6a9fd38..509e5bcb33304 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -224,7 +224,7 @@ def test_to_latex_multiindex(self): assert result == expected - result = df.groupby("a").describe().drop(columns="a").to_latex() + result = df.groupby("a").describe().to_latex() expected = r"""\begin{tabular}{lrrrrrrrr} \toprule {} & \multicolumn{8}{l}{c} \\ From 67e974438bbb8f94dd4169dec16fe36bc64d45e7 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 18 Jul 2020 13:49:54 +0100 Subject: [PATCH 20/34] restoring test_function.py to master --- pandas/tests/groupby/test_function.py | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 67cc90d23620d..e693962e57ac3 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -319,6 +319,16 @@ def test_non_cython_api(): result = g.describe() tm.assert_frame_equal(result, expected) + expected = pd.concat( + [ + df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T, + ] + ) + expected.index = pd.Index([0, 1]) + result = gni.describe() + tm.assert_frame_equal(result, expected) + # any expected = DataFrame( [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] @@ -980,3 +990,62 @@ def test_frame_describe_unstacked_format(): columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = pd.DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 
3689348814740003840 + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], dtype=object, name="a") + expected = pd.DataFrame({"b": arr}, index=idx) + + groups = pd.DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = pd.Index([0, 1, 2], dtype=object, name="a") + + result = grouped["b"].sum(min_count=2) + expected = pd.Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = pd.DataFrame( + {"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx + ) + tm.assert_frame_equal(result, expected) From 8f1b9c9fdd355474494f98367ec7f6facbf452cc Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 18 Jul 2020 14:00:41 +0100 Subject: [PATCH 21/34] added comment --- pandas/core/groupby/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 93b779bc35d6a..bc9ef0b6e3456 100644 --- a/pandas/core/groupby/groupby.py +++ 
b/pandas/core/groupby/groupby.py @@ -1628,6 +1628,8 @@ def describe(self, **kwargs): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T + # GH 34656 self.apply() will return non-nuisance grouping columns, but + # we remove them from describe if as_index=True if self.as_index: with _group_selection_context(self): if self._group_selection is not None: From f422b7d5a962a06f4e8d66a18aad49bdfffc8fe5 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sun, 26 Jul 2020 22:52:02 +0100 Subject: [PATCH 22/34] fixed describe to work with duplicate cols --- pandas/core/groupby/groupby.py | 4 +- pandas/tests/groupby/test_function.py | 57 +++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bc9ef0b6e3456..05d7aa1379d95 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1633,8 +1633,8 @@ def describe(self, **kwargs): if self.as_index: with _group_selection_context(self): if self._group_selection is not None: - cols = result.columns.intersection(self._group_selection) - result = result.reindex(columns=cols) + group_cols = result.columns.difference(self._group_selection) + result = result.drop(columns=group_cols) return result.unstack() def resample(self, rule, *args, **kwargs): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e693962e57ac3..1c5d08561ff90 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -992,6 +992,63 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("as_index", [True, False]) +def test_describe_with_duplicate_output_column_names(as_index): + # GH 35314 + df = pd.DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a", "b", "b"], + ) + + expected = ( + 
pd.DataFrame.from_records( + [ + ("a", "count", 3.0, 3.0), + ("a", "mean", 88.0, 99.0), + ("a", "std", 0.0, 0.0), + ("a", "min", 88.0, 99.0), + ("a", "25%", 88.0, 99.0), + ("a", "50%", 88.0, 99.0), + ("a", "75%", 88.0, 99.0), + ("a", "max", 88.0, 99.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + expected.index = pd.Index([88, 99], name="a") + + if as_index: + expected = expected.drop(columns=["a"], level=0) + else: + expected = expected.reset_index(drop=True) + + result = df.groupby("a", as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = pd.DataFrame( From 6bec0400c4fbae62b655bde70be41fdbaaa2c9fc Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sun, 26 Jul 2020 23:50:09 +0100 Subject: [PATCH 23/34] update comment --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1c5d08561ff90..97e99337f5ffd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -994,7 +994,7 @@ def test_frame_describe_unstacked_format(): @pytest.mark.parametrize("as_index", [True, False]) def test_describe_with_duplicate_output_column_names(as_index): - # GH 35314 + # GH #35314 df = pd.DataFrame( { "a": [99, 99, 99, 88, 88, 88], From 8cdd4cd34728c89ca28a2c1044ca2c6132b2efb1 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 27 Jul 2020 08:39:31 +0100 Subject: [PATCH 
24/34] context manager in agg_general --- pandas/core/groupby/groupby.py | 177 +++++++++--------- .../tests/groupby/transform/test_transform.py | 3 - 2 files changed, 89 insertions(+), 91 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 05d7aa1379d95..a4c8ff1ec564e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -734,56 +734,58 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name): assert name in self._apply_allowlist - self._set_group_selection() + with _group_selection_context(self): - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(self._obj_with_exclusions, name) - if not isinstance(f, types.MethodType): - return self.apply(lambda self: getattr(self, name)) + # need to setup the selection + # as are not passed directly but in the grouper + f = getattr(self._obj_with_exclusions, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) - f = getattr(type(self._obj_with_exclusions), name) - sig = inspect.signature(f) + f = getattr(type(self._obj_with_exclusions), name) + sig = inspect.signature(f) - def wrapper(*args, **kwargs): - # a little trickery for aggregation functions that need an axis - # argument - if "axis" in sig.parameters: - if kwargs.get("axis", None) is None: - kwargs["axis"] = self.axis + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None: + kwargs["axis"] = self.axis - def curried(x): - return f(x, *args, **kwargs) + def curried(x): + return f(x, *args, **kwargs) - # preserve the name so we can detect it when calling plot methods, - # to avoid duplicates - curried.__name__ = name + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = name - # special case otherwise extra 
plots are created when catching the - # exception below - if name in base.plotting_methods: - return self.apply(curried) + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self.apply(curried) - try: - return self._python_apply_general(curried, self._obj_with_exclusions) - except TypeError as err: - if not re.search( - "reduction operation '.*' not allowed for this dtype", str(err) - ): - # We don't have a cython implementation - # TODO: is the above comment accurate? - raise + try: + return self._python_apply_general( + curried, self._obj_with_exclusions + ) + except TypeError as err: + if not re.search( + "reduction operation '.*' not allowed for this dtype", str(err) + ): + # We don't have a cython implementation + # TODO: is the above comment accurate? + raise - if self.obj.ndim == 1: - # this can be called recursively, so need to raise ValueError - raise ValueError + if self.obj.ndim == 1: + # this can be called recursively, so need to raise ValueError + raise ValueError - # GH#3688 try to operate item-by-item - result = self._aggregate_item_by_item(name, *args, **kwargs) - return result + # GH#3688 try to operate item-by-item + result = self._aggregate_item_by_item(name, *args, **kwargs) + return result - wrapper.__name__ = name - return wrapper + wrapper.__name__ = name + return wrapper def get_group(self, name, obj=None): """ @@ -829,8 +831,6 @@ def __iter__(self): ) def apply(self, func, *args, **kwargs): - self._reset_group_selection() - func = self._is_builtin_func(func) # this is needed so we don't try and wrap strings. 
If we could @@ -992,28 +992,31 @@ def _agg_general( alias: str, npfunc: Callable, ): - self._set_group_selection() - - # try a cython aggregation if we can - try: - return self._cython_agg_general( - how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, - ) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes + # self._set_group_selection() + with _group_selection_context(self): + # try a cython aggregation if we can + try: + return self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) + except DataError: pass - else: - raise + except NotImplementedError as err: + if "function is not implemented for this dtype" in str( + err + ) or "category dtype not supported" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + else: + raise - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1625,16 +1628,11 @@ def ohlc(self) -> DataFrame: @doc(DataFrame.describe) def describe(self, **kwargs): - result = self.apply(lambda x: x.describe(**kwargs)) - if self.axis == 1: - return result.T - # GH 34656 self.apply() will return non-nuisance grouping columns, but - # we remove them from describe if as_index=True - if self.as_index: - with _group_selection_context(self): - if self._group_selection is not None: - group_cols = result.columns.difference(self._group_selection) - result = result.drop(columns=group_cols) + with 
_group_selection_context(self): + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() return result.unstack() def resample(self, rule, *args, **kwargs): @@ -1936,29 +1934,32 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - self._set_group_selection() + # self._set_group_selection() + with _group_selection_context(self): - mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) - mask = mask_left | mask_right + mask_left = np.in1d(self._cumcount_array(), nth_array) + mask_right = np.in1d( + self._cumcount_array(ascending=False) + 1, -nth_array + ) + mask = mask_left | mask_right - ids, _, _ = self.grouper.group_info + ids, _, _ = self.grouper.group_info - # Drop NA values in grouping - mask = mask & (ids != -1) + # Drop NA values in grouping + mask = mask & (ids != -1) - out = self._selected_obj[mask] - if not self.as_index: - return out + out = self._selected_obj[mask] + if not self.as_index: + return out - result_index = self.grouper.result_index - out.index = result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) - out = self._reindex_output(out) - return out.sort_index() if self.sort else out + out = self._reindex_output(out) + return out.sort_index() if self.sort else out # dropna is truthy if isinstance(n, valid_containers): diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 47e5fe2742075..cdaf27e214d80 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ 
b/pandas/tests/groupby/transform/test_transform.py @@ -743,9 +743,6 @@ def test_cython_transform_frame(op, args, targop): else: expected = gb.apply(targop) - if op == "shift" and type(gb_target.get("by")) is str: - expected = expected.drop(columns=gb_target.get("by")) - expected = expected.sort_index(axis=1) tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) From abe8be36adc17e9db4fbf5ce09574e79b5a8b457 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 27 Jul 2020 08:44:02 +0100 Subject: [PATCH 25/34] remove hashed out line --- pandas/core/groupby/groupby.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a4c8ff1ec564e..ef802ffb37a00 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1633,7 +1633,6 @@ def describe(self, **kwargs): if self.axis == 1: return result.T return result.unstack() - return result.unstack() def resample(self, rule, *args, **kwargs): """ @@ -1934,7 +1933,6 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - # self._set_group_selection() with _group_selection_context(self): mask_left = np.in1d(self._cumcount_array(), nth_array) From 7112cf86d78eaa75ef2ee665735b839f717e3735 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 27 Jul 2020 09:00:23 +0100 Subject: [PATCH 26/34] limited context manager in _make_wrapper --- pandas/core/groupby/groupby.py | 72 ++++++++++++++++------------------ 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ef802ffb37a00..ec09f132aa088 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -735,57 +735,54 @@ def _make_wrapper(self, name): assert name in self._apply_allowlist with 
_group_selection_context(self): - # need to setup the selection # as are not passed directly but in the grouper f = getattr(self._obj_with_exclusions, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) - f = getattr(type(self._obj_with_exclusions), name) - sig = inspect.signature(f) + f = getattr(type(self._obj_with_exclusions), name) + sig = inspect.signature(f) - def wrapper(*args, **kwargs): - # a little trickery for aggregation functions that need an axis - # argument - if "axis" in sig.parameters: - if kwargs.get("axis", None) is None: - kwargs["axis"] = self.axis + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None: + kwargs["axis"] = self.axis - def curried(x): - return f(x, *args, **kwargs) + def curried(x): + return f(x, *args, **kwargs) - # preserve the name so we can detect it when calling plot methods, - # to avoid duplicates - curried.__name__ = name + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = name - # special case otherwise extra plots are created when catching the - # exception below - if name in base.plotting_methods: - return self.apply(curried) + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self.apply(curried) - try: - return self._python_apply_general( - curried, self._obj_with_exclusions - ) - except TypeError as err: - if not re.search( - "reduction operation '.*' not allowed for this dtype", str(err) - ): - # We don't have a cython implementation - # TODO: is the above comment accurate? 
- raise + try: + return self._python_apply_general(curried, self._obj_with_exclusions) + except TypeError as err: + if not re.search( + "reduction operation '.*' not allowed for this dtype", str(err) + ): + # We don't have a cython implementation + # TODO: is the above comment accurate? + raise - if self.obj.ndim == 1: - # this can be called recursively, so need to raise ValueError - raise ValueError + if self.obj.ndim == 1: + # this can be called recursively, so need to raise ValueError + raise ValueError - # GH#3688 try to operate item-by-item - result = self._aggregate_item_by_item(name, *args, **kwargs) - return result + # GH#3688 try to operate item-by-item + result = self._aggregate_item_by_item(name, *args, **kwargs) + return result - wrapper.__name__ = name - return wrapper + wrapper.__name__ = name + return wrapper def get_group(self, name, obj=None): """ @@ -992,7 +989,6 @@ def _agg_general( alias: str, npfunc: Callable, ): - # self._set_group_selection() with _group_selection_context(self): # try a cython aggregation if we can try: From 0c8b1440ca5d2e21aa2ae47b9043899e39f83b12 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 27 Jul 2020 10:12:36 +0100 Subject: [PATCH 27/34] removed unrelated test --- pandas/tests/groupby/test_function.py | 57 --------------------------- 1 file changed, 57 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 97e99337f5ffd..e693962e57ac3 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -992,63 +992,6 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("as_index", [True, False]) -def test_describe_with_duplicate_output_column_names(as_index): - # GH #35314 - df = pd.DataFrame( - { - "a": [99, 99, 99, 88, 88, 88], - "b": [1, 2, 3, 4, 5, 6], - "c": [10, 20, 30, 40, 50, 60], - }, - columns=["a", "b", "b"], - ) - - expected = ( - pd.DataFrame.from_records( - [ - 
("a", "count", 3.0, 3.0), - ("a", "mean", 88.0, 99.0), - ("a", "std", 0.0, 0.0), - ("a", "min", 88.0, 99.0), - ("a", "25%", 88.0, 99.0), - ("a", "50%", 88.0, 99.0), - ("a", "75%", 88.0, 99.0), - ("a", "max", 88.0, 99.0), - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ], - ) - .set_index([0, 1]) - .T - ) - expected.columns.names = [None, None] - expected.index = pd.Index([88, 99], name="a") - - if as_index: - expected = expected.drop(columns=["a"], level=0) - else: - expected = expected.reset_index(drop=True) - - result = df.groupby("a", as_index=as_index).describe() - - tm.assert_frame_equal(result, expected) - - def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = pd.DataFrame( From 755c8f0edd6fafbe972d57ade4334f48537746a8 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 27 Jul 2020 10:51:31 +0100 Subject: [PATCH 28/34] update comment --- pandas/tests/groupby/test_apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ee6200d4e946b..d5da08f15b440 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1017,8 +1017,8 @@ def test_apply_with_timezones_aware(): def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): - # GH 34656 - # GH 34271 + # GH #34656 + # GH #34271 df = DataFrame( { "a": [99, 99, 99, 88, 88, 88], From b61695be939364a2acb57e8a7bf1a641d479695a Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 3 Aug 2020 21:16:06 +0100 Subject: [PATCH 29/34] whatsnew on v1.1.1 --- doc/source/whatsnew/v1.1.0.rst | 
1 - doc/source/whatsnew/v1.1.1.rst | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 71521f82e991d..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1131,7 +1131,6 @@ Groupby/resample/rolling - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). - Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) -- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 443589308ad4c..46bc8234f27ec 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -40,6 +40,10 @@ Bug fixes - +**Groupby/resample/rolling** + +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) + **Indexing** - From 673a35b8ecc80697d2444ea84301649b293f7e76 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 3 Aug 2020 21:17:39 +0100 Subject: [PATCH 30/34] comment typo --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d5da08f15b440..b639862a3e12e 100644 --- a/pandas/tests/groupby/test_apply.py +++ 
b/pandas/tests/groupby/test_apply.py @@ -1032,7 +1032,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): index=pd.Index([88, 99], name="a"), ) - # Check output wehn no other methods are called before .apply() + # Check output when no other methods are called before .apply() grp = df.groupby(by="a") result = grp.apply(sum) tm.assert_frame_equal(result, expected) From 42f53dd644fd89edc977a8bb6a5f3e7ceb34667c Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 3 Aug 2020 23:06:36 +0100 Subject: [PATCH 31/34] amend comment to restart tests --- pandas/tests/groupby/test_apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index b639862a3e12e..3a373838b169b 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1017,8 +1017,8 @@ def test_apply_with_timezones_aware(): def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): - # GH #34656 - # GH #34271 + # GH 34656 + # GH 34271 df = DataFrame( { "a": [99, 99, 99, 88, 88, 88], From 18634a6064f1fce1c3310661e7bfeaeb851a6eaa Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 5 Aug 2020 15:25:13 +0100 Subject: [PATCH 32/34] whatsnew to 1.2.0 --- doc/source/whatsnew/v1.1.1.rst | 4 ---- doc/source/whatsnew/v1.2.0.rst | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 46bc8234f27ec..443589308ad4c 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -40,10 +40,6 @@ Bug fixes - -**Groupby/resample/rolling** - -- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) - **Indexing** - diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b16ca0a80c5b4..ba89e88299e89 
100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -134,7 +134,7 @@ Groupby/resample/rolling - - - +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping ^^^^^^^^^ From 1a0aa44501bdc6bea1d65aa4d1c0eb660df8c16d Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 5 Aug 2020 16:30:28 +0100 Subject: [PATCH 33/34] remove line that can't be triggered by test --- pandas/core/groupby/groupby.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ec09f132aa088..4598548932fcf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1007,8 +1007,6 @@ def _agg_general( # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass - else: - raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) From b09e41e5e64422d9e682e294004850a0ca184dbd Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 5 Aug 2020 18:44:18 +0100 Subject: [PATCH 34/34] restart tests --- pandas/tests/groupby/test_apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ca8a8004a2820..edf0be919fc41 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1017,8 +1017,8 @@ def test_apply_with_timezones_aware(): def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): - # GH 34656 - # GH 34271 + # GH #34656 + # GH #34271 df = DataFrame( { "a": [99, 99, 99, 88, 88, 88],