From aa9c9e1a43ad843eeb3cd0366a4c119e5b9d073f Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 28 Dec 2022 13:59:14 -0500 Subject: [PATCH 01/18] REF: groupby Series selection with as_index=False --- pandas/core/apply.py | 78 +++++++++++++++++--------- pandas/core/base.py | 13 ++--- pandas/core/groupby/generic.py | 84 ++++++++++++++++------------ pandas/core/groupby/groupby.py | 63 ++++++++++++++------- pandas/core/groupby/ops.py | 2 +- pandas/core/series.py | 2 + pandas/tests/groupby/test_groupby.py | 2 + 7 files changed, 154 insertions(+), 90 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 02a9444dd4f97..d6de62676028d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,6 +2,7 @@ import abc from collections import defaultdict +from contextlib import nullcontext from functools import partial import inspect from typing import ( @@ -292,6 +293,10 @@ def agg_list_like(self) -> DataFrame | Series: ------- Result of aggregation. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) from pandas.core.reshape.concat import concat obj = self.obj @@ -312,26 +317,35 @@ def agg_list_like(self) -> DataFrame | Series: results = [] keys = [] - # degenerate case - if selected_obj.ndim == 1: - for a in arg: - colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) - new_res = colg.aggregate(a) - results.append(new_res) + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + if is_groupby: + # When as_index=False, we combine all results using indices + # and adjust index after + context_manager = com.temp_setattr(obj, "as_index", True) + else: + context_manager = nullcontext() + with context_manager: + # degenerate case + if selected_obj.ndim == 1: - # make sure we find a good name - name = com.get_callable_name(a) or a - keys.append(name) + for a in arg: + colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) + new_res = colg.aggregate(a) + results.append(new_res) - # multiples - else: - indices = [] - for index, col in enumerate(selected_obj): - colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) - new_res = colg.aggregate(arg) - results.append(new_res) - indices.append(index) - keys = selected_obj.columns.take(indices) + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + # multiples + else: + indices = [] + for index, col in enumerate(selected_obj): + colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) + new_res = colg.aggregate(arg) + results.append(new_res) + indices.append(index) + keys = selected_obj.columns.take(indices) try: concatenated = concat(results, keys=keys, axis=1, sort=False) @@ -366,6 +380,10 @@ def agg_dict_like(self) -> DataFrame | Series: Result of aggregation. """ from pandas import Index + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) from pandas.core.reshape.concat import concat obj = self.obj @@ -384,15 +402,23 @@ def agg_dict_like(self) -> DataFrame | Series: arg = self.normalize_dictlike_arg("agg", selected_obj, arg) - if selected_obj.ndim == 1: - # key only used for output - colg = obj._gotitem(selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + if is_groupby: + # When as_index=False, we combine all results using indices + # and adjust index after + context_manager = com.temp_setattr(obj, "as_index", True) else: - # key used for column selection and output - results = { - key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() - } + context_manager = nullcontext() + with context_manager: + if selected_obj.ndim == 1: + # key only used for output + colg = obj._gotitem(selection, ndim=1) + results = {key: colg.agg(how) for key, how in arg.items()} + else: + # key used for column selection and output + results = { + key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() + } # set the final keys keys = list(arg.keys()) diff --git a/pandas/core/base.py b/pandas/core/base.py index 826583fd26f5d..8559640c1858d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -216,6 +216,9 @@ def _obj_with_exclusions(self): if self._selection is not None and isinstance(self.obj, ABCDataFrame): return self.obj[self._selection_list] + if isinstance(self.obj, ABCSeries): + return self.obj + if len(self.exclusions) > 0: # equivalent to `self.obj.drop(self.exclusions, axis=1) # but this avoids consolidating and making a copy @@ -235,17 +238,11 @@ def __getitem__(self, key): raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) - elif not getattr(self, "as_index", False): - if key not in self.obj.columns: - raise KeyError(f"Column not found: {key}") - return self._gotitem(key, ndim=2) - else: if key not in self.obj: raise KeyError(f"Column not found: {key}") - subset = self.obj[key] - ndim = subset.ndim - return self._gotitem(key, ndim=ndim, subset=subset) + ndim = self.obj[key].ndim + return self._gotitem(key, ndim=ndim) def _gotitem(self, key, ndim: int, subset=None): """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 905c1193713cc..09648e0d3e040 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -248,7 +248,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs ) index = self.grouper.result_index - return self.obj._constructor(result.ravel(), index=index, name=data.name) + result = self.obj._constructor(result.ravel(), index=index, name=data.name) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result relabeling = func is None columns = None @@ -268,6 +272,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # columns is not narrowed by mypy from relabeling flag assert columns is not None # for mypy ret.columns = columns + if not self.as_index: + ret = self._insert_inaxis_grouper(ret) + ret.index = default_index(len(ret)) return ret else: @@ -287,23 +294,24 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # result is a dict whose keys are the elements of result_index index = self.grouper.result_index - return Series(result, index=index) + result = Series(result, index=index) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result agg = aggregate def _aggregate_multiple_funcs(self, arg) -> DataFrame: if isinstance(arg, dict): - - # show the deprecation, but only if we - # have not shown a higher level one - # GH 15931 - raise SpecificationError("nested renamer is not supported") - - if any(isinstance(x, (tuple, list)) for x in arg): + if self.as_index: + # GH 15931 + raise SpecificationError("nested renamer is not supported") + else: + # GH#50684 - This accidentally worked in 1.x + arg = list(arg.items()) + elif any(isinstance(x, (tuple, list)) for x in arg): arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] - - # indicated column order - columns = next(zip(*arg)) else: # list of functions / function names columns = [] @@ -313,10 +321,13 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: arg = zip(columns, arg) results: dict[base.OutputKey, DataFrame | Series] = {} - for idx, (name, func) in enumerate(arg): + with com.temp_setattr(self, "as_index", True): + # Combine results using the index, need to adjust index after + # if as_index=False (GH#50724) + for idx, (name, func) in enumerate(arg): - key = base.OutputKey(label=name, position=idx) - results[key] = self.aggregate(func) + key = base.OutputKey(label=name, position=idx) + results[key] = self.aggregate(func) if any(isinstance(x, DataFrame) for x in results.values()): from pandas import concat @@ -396,12 +407,18 @@ def _wrap_applied_output( ) if isinstance(result, Series): result.name = self.obj.name + if not self.as_index and not_indexed_same: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return result else: # GH #6265 #24880 result = self.obj._constructor( data=values, index=self.grouper.result_index, name=self.obj.name ) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -630,6 +647,9 @@ def nunique(self, dropna: bool = True) -> Series: res[ids[idx]] = out result = self.obj._constructor(res, index=ri, name=self.obj.name) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return self._reindex_output(result, fill_value=0) @doc(Series.describe) @@ -643,12 +663,11 @@ def value_counts( ascending: bool = False, bins=None, dropna: bool = True, - ) -> Series: + ) -> Series | DataFrame: if bins is None: result = self._value_counts( normalize=normalize, sort=sort, ascending=ascending, dropna=dropna ) - assert isinstance(result, Series) return result from pandas.core.reshape.merge import get_join_indexers @@ -786,7 +805,11 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: if is_integer_dtype(out.dtype): out = ensure_int64(out) - return self.obj._constructor(out, index=mi, name=self.obj.name) + result = self.obj._constructor(out, index=mi, name=self.obj.name) + if not self.as_index: + result.name = "proportion" if normalize else "count" + result = result.reset_index() + return result def fillna( self, @@ -1274,7 +1297,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = result.columns.droplevel(-1) if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) return result @@ -1386,7 +1409,7 @@ def _wrap_applied_output( return self.obj._constructor_sliced(values, index=key_index) else: result = self.obj._constructor(values, columns=[self._selection]) - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) return result else: # values are Series @@ -1443,7 +1466,7 @@ def _wrap_applied_output_series( result = self.obj._constructor(stacked_values, index=index, columns=columns) if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) return self._reindex_output(result) @@ -1774,7 +1797,9 @@ def _gotitem(self, key, ndim: int, subset=None): subset, level=self.level, grouper=self.grouper, + exclusions=self.exclusions, selection=key, + as_index=self.as_index, sort=self.sort, group_keys=self.group_keys, observed=self.observed, @@ -1790,19 +1815,6 @@ def _get_data_to_aggregate(self) -> Manager2D: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: - # zip in reverse so we can always insert at loc 0 - columns = result.columns - for name, lev, in_axis in zip( - reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), - reversed([grp.in_axis for grp in self.grouper.groupings]), - ): - # GH #28549 - # When using .apply(-), name will be in columns already - if in_axis and name not in columns: - result.insert(0, name, lev) - def _indexed_output_to_ndframe( self, output: Mapping[base.OutputKey, ArrayLike] ) -> DataFrame: @@ -1825,7 +1837,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: mgr.set_axis(1, index) result = self.obj._constructor(mgr) - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) result = result._consolidate() else: index = self.grouper.result_index @@ -1918,7 +1930,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: if not self.as_index: results.index = default_index(len(results)) - self._insert_inaxis_grouper_inplace(results) + results = self._insert_inaxis_grouper(results) return results diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 431b23023b094..a7e3b4215625b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -123,6 +123,7 @@ class providing the base-class of operations. Index, MultiIndex, RangeIndex, + default_index, ) from pandas.core.internals.blocks import ensure_block_shape from pandas.core.series import Series @@ -910,8 +911,6 @@ def __init__( self.level = level if not as_index: - if not isinstance(obj, DataFrame): - raise TypeError("as_index=False only valid with DataFrame") if axis != 0: raise ValueError("as_index=False only valid for axis=0") @@ -1157,6 +1156,24 @@ def _set_result_index_ordered( return result + def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + if isinstance(result, Series): + result = result.to_frame() + + # zip in reverse so we can always insert at loc 0 + columns = result.columns + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): + # GH #28549 + # When using .apply(-), name will be in columns already + if in_axis and name not in columns: + result.insert(0, name, lev) + + return result + def _indexed_output_to_ndframe( self, result: Mapping[base.OutputKey, ArrayLike] ) -> Series | DataFrame: @@ -1193,7 +1210,7 @@ def _wrap_aggregated_output( if not self.as_index: # `not self.as_index` is only relevant for DataFrameGroupBy, # enforced in __init__ - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) result = result._consolidate() index = Index(range(self.grouper.ngroups)) @@ -1613,7 +1630,10 @@ def array_func(values: ArrayLike) -> ArrayLike: res = self._wrap_agged_manager(new_mgr) if is_ser: - res.index = self.grouper.result_index + if self.as_index: + res.index = self.grouper.result_index + else: + res = self._insert_inaxis_grouper(res) return self._reindex_output(res) else: return res @@ -1887,7 +1907,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: result = self._wrap_agged_manager(new_mgr) if result.ndim == 1: - result.index = self.grouper.result_index + if self.as_index: + result.index = self.grouper.result_index + else: + result = self._insert_inaxis_grouper(result) return self._reindex_output(result, fill_value=0) @@ -2622,31 +2645,33 @@ def describe( exclude=None, ) -> NDFrameT: with self._group_selection_context(): - if len(self._selected_obj) == 0: - described = self._selected_obj.describe( + selected_obj = self._selected_obj + if len(selected_obj) == 0: + described = selected_obj.describe( percentiles=percentiles, include=include, exclude=exclude ) - if self._selected_obj.ndim == 1: + if selected_obj.ndim == 1: result = described else: result = described.unstack() return result.to_frame().T.iloc[:0] - result = self._python_apply_general( - lambda x: x.describe( - percentiles=percentiles, include=include, exclude=exclude - ), - self._selected_obj, - not_indexed_same=True, - ) + with com.temp_setattr(self, "as_index", True): + result = self._python_apply_general( + lambda x: x.describe( + percentiles=percentiles, include=include, exclude=exclude + ), + selected_obj, + not_indexed_same=True, + ) if self.axis == 1: return result.T # GH#49256 - properly handle the grouping column(s) - if self._selected_obj.ndim != 1 or self.as_index: - result = result.unstack() - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + result = result.unstack() + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ea902800cf7e0..f88236b2464c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -946,7 +946,7 @@ def result_index(self) -> Index: @final def get_group_levels(self) -> list[ArrayLike]: - # Note: only called from _insert_inaxis_grouper_inplace, which + # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: return [self.groupings[0].group_arraylike] diff --git a/pandas/core/series.py b/pandas/core/series.py index 6b82d48f82ce7..ea6725fde5908 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1977,6 +1977,8 @@ def groupby( if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") + if not as_index: + raise TypeError("as_index=False only valid with DataFrame") axis = self._get_axis_number(axis) return SeriesGroupBy( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3baf2d86010f7..c3ce3a1cc84c7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -652,6 +652,8 @@ def test_groupby_as_index_select_column_sum_empty_df(): left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False) expected = DataFrame(columns=df.columns[:2], index=range(0)) + # GH#?? - Columns after selection shouldn't retain names + expected.columns.names = [None] tm.assert_frame_equal(left, expected) From 7d00d07bf36468e97a1da910362885ccfb42710b Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 14 Jan 2023 10:46:26 -0500 Subject: [PATCH 02/18] GH# --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c3ce3a1cc84c7..9b293f0f1669c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -652,7 +652,7 @@ def test_groupby_as_index_select_column_sum_empty_df(): left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False) expected = DataFrame(columns=df.columns[:2], index=range(0)) - # GH#?? - Columns after selection shouldn't retain names + # GH#50744 - Columns after selection shouldn't retain names expected.columns.names = [None] tm.assert_frame_equal(left, expected) From 41399ad544fbcf3ab281f9264b34b62ecd74141a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 16 Jan 2023 17:08:22 -0500 Subject: [PATCH 03/18] type-hinting fixes --- pandas/core/apply.py | 3 +++ pandas/core/groupby/generic.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d6de62676028d..c28da1bc758cd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -9,6 +9,7 @@ TYPE_CHECKING, Any, Callable, + ContextManager, DefaultDict, Dict, Hashable, @@ -318,6 +319,7 @@ def agg_list_like(self) -> DataFrame | Series: keys = [] is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + context_manager: ContextManager if is_groupby: # When as_index=False, we combine all results using indices # and adjust index after @@ -403,6 +405,7 @@ def agg_dict_like(self) -> DataFrame | Series: arg = self.normalize_dictlike_arg("agg", selected_obj, arg) is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + context_manager: ContextManager if is_groupby: # When as_index=False, we combine all results using indices # and adjust index after diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 09648e0d3e040..2340c36d14301 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -594,7 +594,7 @@ def true_and_notna(x) -> bool: filtered = self._apply_filter(indices, dropna) return filtered - def nunique(self, dropna: bool = True) -> Series: + def nunique(self, dropna: bool = True) -> Series | DataFrame: """ Return number of unique elements in the group. @@ -646,7 +646,9 @@ def nunique(self, dropna: bool = True) -> Series: # GH#21334s res[ids[idx]] = out - result = self.obj._constructor(res, index=ri, name=self.obj.name) + result: Series | DataFrame = self.obj._constructor( + res, index=ri, name=self.obj.name + ) if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) From c26957d49b4b64eaad1201fc8678e38be390d859 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 16 Jan 2023 23:33:29 -0500 Subject: [PATCH 04/18] WIP --- pandas/core/groupby/groupby.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a7e3b4215625b..7ea2139e4ba50 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -939,6 +939,19 @@ def __init__( self.grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() + with self._group_selection_context(): + so = self._selected_obj + # if self.ndim == 2 and so.ndim == 1: + # so = so.to_frame() + owe = self._obj_with_exclusions + import pandas._testing as tm + print('---') + print(owe.head()) + print('---') + print(so.head()) + print('---') + tm.assert_equal(owe, so) + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) From 1860c4dc38fe447895f8a36b1de657e043784d84 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 17 Jan 2023 19:53:58 -0500 Subject: [PATCH 05/18] WIP --- pandas/core/groupby/groupby.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7ea2139e4ba50..bd0c92df7c2d6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -726,7 +726,7 @@ def _selected_obj(self): if self._selection is None or isinstance(self.obj, Series): if self._group_selection is not None: - return self.obj[self._group_selection] + return self.obj._take(self._group_selection, axis=1, convert_indices=False) return self.obj else: return self.obj[self._selection] @@ -939,18 +939,18 @@ def __init__( self.grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() - with self._group_selection_context(): - so = self._selected_obj - # if self.ndim == 2 and so.ndim == 1: - # so = so.to_frame() - owe = self._obj_with_exclusions - import pandas._testing as tm - print('---') - print(owe.head()) - print('---') - print(so.head()) - print('---') - tm.assert_equal(owe, so) + # with self._group_selection_context(): + # so = self._selected_obj + # # if self.ndim == 2 and so.ndim == 1: + # # so = so.to_frame() + # owe = self._obj_with_exclusions + # import pandas._testing as tm + # print('---') + # print(owe.head()) + # print('---') + # print(so.head()) + # print('---') + # tm.assert_equal(owe, so) def __getattr__(self, attr: str): if attr in self._internal_names_set: @@ -1037,6 +1037,7 @@ def _set_group_selection(self) -> None: # GH12839 clear selected obj cache when group selection changes ax = self.obj._info_axis self._group_selection = ax.difference(Index(groupers), sort=False).tolist() + self._group_selection = [idx for idx, label in enumerate(ax) if label not in groupers] self._reset_cache("_selected_obj") @final From e42e222f6c8f70c567dcae50ec91438228808621 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 18 Jan 2023 16:33:44 -0500 Subject: [PATCH 06/18] WIP --- pandas/tests/groupby/test_function.py | 21 +++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 11 +++++++++++ 2 files changed, 32 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e16e353cc1a4..8611b928b5a40 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1254,6 +1254,27 @@ def test_describe_with_duplicate_output_column_names(as_index, keys): tm.assert_frame_equal(result, expected) +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = DataFrame( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a1c1930c2e11b..ded764ad7a613 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2867,3 +2867,14 @@ def test_groupby_method_drop_na(method): else: expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4]) tm.assert_frame_equal(result, expected) + + +def test_selected_obj_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + with gb._group_selection_context(): + result = gb._selected_obj + expected = df.take([0, 2, 3], axis=1) + tm.assert_frame_equal(result, expected) From 0bdf009cecb6b52b406c0482cfa969a8548a2523 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 28 Dec 2022 13:59:14 -0500 Subject: [PATCH 07/18] BUG: groupby.describe on a frame with duplicate column names --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/groupby/groupby.py | 7 +++- pandas/tests/groupby/test_function.py | 47 +++++++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 11 +++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 614832c5acd1b..715abb7de4eab 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1063,6 +1063,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`) - Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`) - Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`) +- Bug in :meth:`.DataFrameGroupBy.describe` produced incorrect results when data had duplicate columns (:issue:`50806`) - Reshaping diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c15948ce877a8..52bf337a86c92 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -726,7 +726,9 @@ def _selected_obj(self): if self._selection is None or isinstance(self.obj, Series): if self._group_selection is not None: - return self.obj[self._group_selection] + return self.obj._take( + self._group_selection, axis=1, convert_indices=False + ) return self.obj else: return self.obj[self._selection] @@ -1024,6 +1026,9 @@ def _set_group_selection(self) -> None: # GH12839 clear selected obj cache when group selection changes ax = self.obj._info_axis self._group_selection = ax.difference(Index(groupers), sort=False).tolist() + self._group_selection = [ + idx for idx, label in enumerate(ax) if label not in groupers + ] self._reset_cache("_selected_obj") @final diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e16e353cc1a4..c077fb1d257a5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1254,6 +1254,27 @@ def test_describe_with_duplicate_output_column_names(as_index, keys): tm.assert_frame_equal(result, expected) +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = DataFrame( @@ -1594,3 +1615,29 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): result = method(*args).index expected = df.index tm.assert_index_equal(result, expected) + + +def test_duplicate_columns(request, groupby_func, as_index): + # GH#50806 + if groupby_func == "corrwith": + msg = "GH#50845 - corrwith fails when there are duplicate columns" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) + args = get_groupby_method_args(groupby_func, df) + gb = df.groupby("a", as_index=as_index) + result = getattr(gb, groupby_func)(*args) + + if groupby_func in ("size", "ngroup", "cumcount"): + expected = getattr( + df.take([0, 1], axis=1).groupby("a", as_index=as_index), groupby_func + )(*args) + tm.assert_equal(result, expected) + else: + expected_df = df.copy() + expected_df.columns = ["a", "b", "c"] + expected_args = get_groupby_method_args(groupby_func, expected_df) + expected = getattr(expected_df.groupby("a", as_index=as_index), groupby_func)( + *expected_args + ) + expected = expected.rename(columns={"c": "b"}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a1c1930c2e11b..ded764ad7a613 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2867,3 +2867,14 @@ def test_groupby_method_drop_na(method): else: expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4]) tm.assert_frame_equal(result, expected) + + +def test_selected_obj_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + with gb._group_selection_context(): + result = gb._selected_obj + expected = df.take([0, 2, 3], axis=1) + tm.assert_frame_equal(result, expected) From 185e4f8e6006a6cc9404fd608a991c48cfc1f4d1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 18 Jan 2023 17:31:40 -0500 Subject: [PATCH 08/18] cleanup --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 52bf337a86c92..37e75a984c92c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1025,7 +1025,6 @@ def _set_group_selection(self) -> None: if len(groupers): # GH12839 clear selected obj cache when group selection changes ax = self.obj._info_axis - self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._group_selection = [ idx for idx, label in enumerate(ax) if label not in groupers ] From d2b965ff426b4d467e70d815618aaf0c2e9c3ac7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 19 Jan 2023 16:22:33 -0500 Subject: [PATCH 09/18] test fixup --- pandas/tests/groupby/test_groupby_dropna.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 5418a2a60dc80..3cee8baeb6e5b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -544,9 +544,12 @@ def test_categorical_reducers( gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() - expected["x"] = expected["x"].replace(4, None) + # Workaround since we can't use replace (GH#50872) + mask = expected["x"] == 4 + expected["x"] = expected["x"].mask(mask, None).cat.remove_categories([4]) if index_kind == "multi": - expected["x2"] = expected["x2"].replace(4, None) + mask = expected["x2"] == 4 + expected["x2"] = expected["x2"].mask(mask, None).cat.remove_categories([4]) if as_index: if index_kind == "multi": expected = expected.set_index(["x", "x2"]) @@ -578,6 +581,8 @@ def test_categorical_reducers( result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame + print(result.index.dtype) + print(expected.index.dtype) tm.assert_equal(result, expected) From 932e3c87b7c50906d52511c0fb01165031f56560 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 19 Jan 2023 16:26:21 -0500 Subject: [PATCH 10/18] Fix type-hint for _group_selection --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 37e75a984c92c..37041f515c0d8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -610,7 +610,7 @@ def f(self): class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): - _group_selection: IndexLabel | None = None + _group_selection: list[int] | None = None _hidden_attrs = PandasObject._hidden_attrs | { "as_index", "axis", From eeea6fcc41691741103d8820c8d2c80c8712be39 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 19 Jan 2023 22:26:01 -0500 Subject: [PATCH 11/18] Merge branch 'groupby_select_obj_dup_cols' of https://github.com/rhshadrach/pandas into groupby_select_obj_dup_cols # Conflicts: # pandas/core/groupby/groupby.py --- pandas/core/groupby/groupby.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b7d52dd207019..37041f515c0d8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -941,19 +941,6 @@ def __init__( self.grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() - # with self._group_selection_context(): - # so = self._selected_obj - # # if self.ndim == 2 and so.ndim == 1: - # # so = so.to_frame() - # owe = self._obj_with_exclusions - # import pandas._testing as tm - # print('---') - # print(owe.head()) - # print('---') - # print(so.head()) - # print('---') - # tm.assert_equal(owe, so) - def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) From 83f12b77601f956e16efce8305947127d0f4c593 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 19 Jan 2023 23:06:20 -0500 Subject: [PATCH 12/18] Speedup --- pandas/core/groupby/groupby.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 37041f515c0d8..4ae205ba2d85b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -610,7 +610,7 @@ def f(self): class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): - _group_selection: list[int] | None = None + _group_selection: np.ndarray | None = None _hidden_attrs = PandasObject._hidden_attrs | { "as_index", "axis", @@ -1025,9 +1025,15 @@ def _set_group_selection(self) -> None: if len(groupers): # GH12839 clear selected obj cache when group selection changes ax = self.obj._info_axis - self._group_selection = [ - idx for idx, label in enumerate(ax) if label not in groupers - ] + if len(ax) < 2000: + # Determined experimentally, after 2000 this is slower than + # the NumPy version + self._group_selection = np.array( + [idx for idx, label in enumerate(ax) if label not in groupers] + ) + else: + indexer = ax.get_indexer_for(list(groupers)) + self._group_selection = np.delete(np.arange(len(ax)), indexer) self._reset_cache("_selected_obj") @final From c37a1ababc94d90aacf4cddc7bd05f4fc830dad8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 20 Jan 2023 16:46:07 -0500 Subject: [PATCH 13/18] refinement --- pandas/core/groupby/groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4ae205ba2d85b..05aa0095247a0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1025,9 +1025,8 @@ def _set_group_selection(self) -> None: if len(groupers): # GH12839 clear selected obj cache when group selection changes ax = self.obj._info_axis - if len(ax) < 2000: - # Determined experimentally, after 2000 this is slower than - # the NumPy version + if len(ax) < 1000: + # Determined experimentally, larger is slower than the NumPy version self._group_selection = np.array( [idx for idx, label in enumerate(ax) if label not in groupers] ) From 4dafe5a3258bbe5a63f8e02d7b8e53946d166a6b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 25 Jan 2023 16:22:44 -0500 Subject: [PATCH 14/18] cleanup, faster implementation --- pandas/core/groupby/groupby.py | 15 ++++----------- pandas/tests/groupby/test_groupby_dropna.py | 9 ++------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0717fc1cc7765..c6855944af2b5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1020,19 +1020,12 @@ def _set_group_selection(self) -> None: ): return - groupers = self.exclusions - - if len(groupers): + exclusions = self.exclusions + if len(exclusions): # GH12839 clear selected obj cache when group selection changes ax = self.obj._info_axis - if len(ax) < 1000: - # Determined experimentally, larger is slower than the NumPy version - self._group_selection = np.array( - [idx for idx, label in enumerate(ax) if label not in groupers] - ) - else: - indexer = ax.get_indexer_for(list(groupers)) - self._group_selection = np.delete(np.arange(len(ax)), indexer) + # ilocs of ax that are not in the exclusions + self._group_selection = np.arange(len(ax))[~ax.isin(exclusions)] self._reset_cache("_selected_obj") @final diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 3cee8baeb6e5b..5418a2a60dc80 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -544,12 +544,9 @@ def test_categorical_reducers( gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() - # Workaround since we can't use replace (GH#50872) - mask = expected["x"] == 4 - expected["x"] = expected["x"].mask(mask, None).cat.remove_categories([4]) + expected["x"] = expected["x"].replace(4, None) if index_kind == "multi": - mask = expected["x2"] == 4 - expected["x2"] = expected["x2"].mask(mask, None).cat.remove_categories([4]) + expected["x2"] = expected["x2"].replace(4, None) if as_index: if index_kind == "multi": expected = expected.set_index(["x", "x2"]) @@ -581,8 +578,6 @@ def test_categorical_reducers( result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame - print(result.index.dtype) - print(expected.index.dtype) tm.assert_equal(result, expected) From d5df78cb9ebc61dc2892b90ca1b0193b5d82654b Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 30 Jan 2023 22:51:28 -0500 Subject: [PATCH 15/18] Make group_selection a Boolean flag --- pandas/core/groupby/groupby.py | 38 +++++++--------------------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4ae205ba2d85b..84211888124a5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -610,7 +610,7 @@ def f(self): class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): - _group_selection: np.ndarray | None = None + _group_selection: bool = False _hidden_attrs = PandasObject._hidden_attrs | { "as_index", "axis", @@ -725,10 +725,8 @@ def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy if self._selection is None or isinstance(self.obj, Series): - if self._group_selection is not None: - return self.obj._take( - self._group_selection, axis=1, convert_indices=False - ) + if self._group_selection: + return self._obj_with_exclusions return self.obj else: return self.obj[self._selection] @@ -1011,30 +1009,10 @@ def _set_group_selection(self) -> None: NOTE: this should be paired with a call to _reset_group_selection """ - # This is a no-op for SeriesGroupBy - grp = self.grouper - if ( - grp.groupings is None - or self.obj.ndim == 1 - or self._group_selection is not None - ): + if self.grouper.groupings is None or self.obj.ndim == 1: return - - groupers = self.exclusions - - if len(groupers): - # GH12839 clear selected obj cache when group selection changes - ax = self.obj._info_axis - if len(ax) < 2000: - # Determined experimentally, after 2000 this is slower than - # the NumPy version - self._group_selection = np.array( - [idx for idx, label in enumerate(ax) if label not in groupers] - ) - else: - indexer = ax.get_indexer_for(list(groupers)) - self._group_selection = np.delete(np.arange(len(ax)), indexer) - self._reset_cache("_selected_obj") + self._group_selection = True + self._reset_cache("_selected_obj") @final def _reset_group_selection(self) -> None: @@ -1044,9 +1022,9 @@ def _reset_group_selection(self) -> None: Used for methods needing to return info on each group regardless of whether a group selection was previously set. """ - if self._group_selection is not None: + if self._group_selection: # GH12839 clear cached selection too when changing group selection - self._group_selection = None + self._group_selection = False self._reset_cache("_selected_obj") @contextmanager From 8d6df54006c60ead171be06745116c15076cf5f3 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 30 Jan 2023 22:55:17 -0500 Subject: [PATCH 16/18] Avoid resetting cache --- pandas/core/groupby/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ae5175cc19e39..f1f2a16d47dd4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1009,7 +1009,8 @@ def _set_group_selection(self) -> None: NOTE: this should be paired with a call to _reset_group_selection """ - if self.grouper.groupings is None or self.obj.ndim == 1: + grp = self.grouper + if grp.groupings is None or self.obj.ndim == 1 or self._group_selection: return self._group_selection = True self._reset_cache("_selected_obj") From 62540af2c466a28e013690692d370bc8ab1d3c73 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 31 Jan 2023 20:19:10 -0500 Subject: [PATCH 17/18] Improve test --- pandas/tests/groupby/test_function.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c077fb1d257a5..d00dde7179df7 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1627,17 +1627,10 @@ def test_duplicate_columns(request, groupby_func, as_index): gb = df.groupby("a", as_index=as_index) result = getattr(gb, groupby_func)(*args) - if groupby_func in ("size", "ngroup", "cumcount"): - expected = getattr( - df.take([0, 1], axis=1).groupby("a", as_index=as_index), groupby_func - )(*args) - tm.assert_equal(result, expected) - else: - expected_df = df.copy() - expected_df.columns = ["a", "b", "c"] - expected_args = get_groupby_method_args(groupby_func, expected_df) - expected = getattr(expected_df.groupby("a", as_index=as_index), groupby_func)( - *expected_args - ) + expected_df = df.set_axis(["a", "b", "c"], axis=1) + expected_args = get_groupby_method_args(groupby_func, expected_df) + expected_gb = expected_df.groupby("a", as_index=as_index) + expected = getattr(expected_gb, groupby_func)(*expected_args) + if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) From 359d7fffb7ae4ca5eebecc0fa75014c00723ec5d Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 2 Feb 2023 22:57:11 -0500 Subject: [PATCH 18/18] Rework test --- pandas/tests/groupby/test_groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6771e5566b2f9..d7b015fa7104a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2830,12 +2830,11 @@ def test_groupby_reduce_period(): tm.assert_series_equal(res, expected) -def test_selected_obj_duplicate_columns(): +def test_obj_with_exclusions_duplicate_columns(): # GH#50806 df = DataFrame([[0, 1, 2, 3]]) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1]) - with gb._group_selection_context(): - result = gb._selected_obj + result = gb._obj_with_exclusions expected = df.take([0, 2, 3], axis=1) tm.assert_frame_equal(result, expected)