diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 970e3c4ac80f4..e2855cbc90425 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -10,6 +10,7 @@ import numpy as np from pandas._typing import ( + ArrayLike, FrameOrSeries, final, ) @@ -587,20 +588,23 @@ def indices(self): @property def codes(self) -> np.ndarray: - if self._passed_categorical: - # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes - cat = self.grouper - return cat.codes + if self._codes is not None: + # _codes is set in __init__ for MultiIndex cases + return self._codes - if self._codes is None: - self._make_codes() - # error: Incompatible return value type (got "Optional[ndarray]", - # expected "ndarray") - return self._codes # type: ignore[return-value] + return self._codes_and_uniques[0] + + @cache_readonly + def group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain ExtensionDtypes. + """ + return self._codes_and_uniques[1] @cache_readonly def result_index(self) -> Index: + # TODO: what's the difference between result_index vs group_index? 
if self.all_grouper is not None: group_idx = self.group_index assert isinstance(group_idx, CategoricalIndex) @@ -609,6 +613,14 @@ def result_index(self) -> Index: @cache_readonly def group_index(self) -> Index: + if self._group_index is not None: + # _group_index is set in __init__ for MultiIndex cases + return self._group_index + uniques = self.group_arraylike + return Index(uniques, name=self.name) + + @cache_readonly + def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes @@ -616,33 +628,22 @@ def group_index(self) -> Index: categories = cat.categories if self.observed: - codes = algorithms.unique1d(cat.codes) - codes = codes[codes != -1] + ucodes = algorithms.unique1d(cat.codes) + ucodes = ucodes[ucodes != -1] if self.sort or cat.ordered: - codes = np.sort(codes) + ucodes = np.sort(ucodes) else: - codes = np.arange(len(categories)) + ucodes = np.arange(len(categories)) - return CategoricalIndex( - Categorical.from_codes( - codes=codes, categories=categories, ordered=cat.ordered - ), - name=self.name, + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered ) + return cat.codes, uniques - if self._group_index is None: - self._make_codes() - assert self._group_index is not None - return self._group_index - - def _make_codes(self) -> None: - if self._codes is not None and self._group_index is not None: - return - - # we have a list of groupers - if isinstance(self.grouper, ops.BaseGrouper): + elif isinstance(self.grouper, ops.BaseGrouper): + # we have a list of groupers codes = self.grouper.codes_info - uniques = self.grouper.result_index + uniques = self.grouper.result_arraylike else: # GH35667, replace dropna=False with na_sentinel=None if not self.dropna: @@ -652,9 +653,7 @@ def _make_codes(self) -> None: codes, uniques = algorithms.factorize( self.grouper, sort=self.sort, 
na_sentinel=na_sentinel ) - uniques = Index(uniques, name=self.name) - self._codes = codes - self._group_index = uniques + return codes, uniques @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b37467fb8cf11..746c6e0056064 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -907,6 +907,19 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + @cache_readonly + def result_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but returning an ndarray/ExtensionArray + allowing us to retain ExtensionDtypes not supported by Index. + """ + # TODO: once Index supports arbitrary EAs, this can be removed in favor + # of result_index + if len(self.groupings) == 1: + return self.groupings[0].group_arraylike + + return self.result_index._values + @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: @@ -919,7 +932,7 @@ def result_index(self) -> Index: ) @final - def get_group_levels(self) -> list[Index]: + def get_group_levels(self) -> list[ArrayLike]: # Note: only called from _insert_inaxis_grouper_inplace, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: