From 611177a09adacc7c4e0f523c408a83aeee6e82c2 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 16:15:26 +0100 Subject: [PATCH 1/2] BUG: groupby agg raising when column was selected twice --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/base.py | 2 +- pandas/tests/groupby/test_indexing.py | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index caf3a4281561f..8738b8546375f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -794,6 +794,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.mean` failing with ``complex`` dtype (:issue:`43701`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and index is decreasing (:issue:`43927`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) +- Bug in :meth:`GroupBy.mean` raising ``KeyError`` when column was selected at least twice (:issue:`44924`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`) - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9040414a8f35f..45a9b92d94b62 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -235,7 +235,7 @@ def __getitem__(self, key): raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): - if len(self.obj.columns.intersection(key)) != len(key): + if len(self.obj.columns.intersection(key)) != len(set(key)): bad_keys = list(set(key).difference(self.obj.columns)) raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index b9f71fd4ed96a..0734a3c0ca08b 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -285,3 +285,19 @@ def test_column_axis(column_group_df): expected = column_group_df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) + + +def test_groupby_duplicated_columns(): + # GH#44924 + df = pd.DataFrame( + { + "A": [1, 2], + "B": [3, 3], + "C": ["G", "G"], + } + ) + result = df.groupby("C")[["A", "B", "A"]].mean() + expected = pd.DataFrame( + [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C") + ) + tm.assert_frame_equal(result, expected) From b43a2260316ebdf435ae37c53df1b7bdf511a9ef Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 16:18:59 +0100 Subject: [PATCH 2/2] Parametrize test --- pandas/tests/groupby/test_indexing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index 0734a3c0ca08b..aea659445801b 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -2,6 +2,7 @@ import random +import numpy as np import pytest import pandas as pd @@ -287,7 +288,8 @@ def test_column_axis(column_group_df): tm.assert_frame_equal(result, expected) -def test_groupby_duplicated_columns(): +@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array]) +def test_groupby_duplicated_columns(func): # GH#44924 df = pd.DataFrame( { @@ -296,7 +298,7 @@ def test_groupby_duplicated_columns(): "C": ["G", "G"], } ) - result = df.groupby("C")[["A", "B", "A"]].mean() + result = df.groupby("C")[func(["A", "B", "A"])].mean() expected = pd.DataFrame( [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C") )