From 6a68c219dffa79521ecf9df0c985151a5f35d354 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 14:53:05 -0700 Subject: [PATCH 1/6] REF: do less in Grouping.__init__ --- pandas/core/groupby/grouper.py | 93 ++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 33 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f1762a2535ff7..8b4d3fab8f950 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -439,6 +439,9 @@ class Grouping: * groups : dict of {group -> label_list} """ + _codes: np.ndarray | None = None + _group_index: Index | None = None + def __init__( self, index: Index, @@ -462,6 +465,8 @@ def __init__( self.in_axis = in_axis self.dropna = dropna + self._passed_categorical = False + # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name @@ -472,20 +477,16 @@ def __init__( # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level - if level is not None: - if not isinstance(level, int): - if level not in index.names: - raise AssertionError(f"Level {level} not in index") - level = index.names.index(level) - + ilevel = self._ilevel + if ilevel is not None: if self.name is None: - self.name = index.names[level] + self.name = index.names[ilevel] ( - self.grouper, + self.grouper, # Index self._codes, self._group_index, - ) = index._get_grouper_for_level(self.grouper, level) + ) = index._get_grouper_for_level(self.grouper, ilevel) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes @@ -509,37 +510,24 @@ def __init__( if self.grouper is None and self.name is not None and self.obj is not None: self.grouper = self.obj[self.name] + if self.grouper.ndim > 1: + # i.e. 
DataFrame case reachable if columns non-unique + t = self.name or str(type(self.grouper)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") + elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): + self._passed_categorical = True self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed ) - categories = self.grouper.categories - - # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes - self._codes = self.grouper.codes - if observed: - codes = algorithms.unique1d(self.grouper.codes) - codes = codes[codes != -1] - if sort or self.grouper.ordered: - codes = np.sort(codes) - else: - codes = np.arange(len(categories)) - - self._group_index = CategoricalIndex( - Categorical.from_codes( - codes=codes, categories=categories, ordered=self.grouper.ordered - ), - name=self.name, - ) # we are done - if isinstance(self.grouper, Grouping): + elif isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed @@ -577,8 +565,20 @@ def __repr__(self) -> str: def __iter__(self): return iter(self.indices) - _codes: np.ndarray | None = None - _group_index: Index | None = None + @cache_readonly + def _ilevel(self) -> int | None: + """ + If necessary, convert index level name to index level position. 
+ """ + level = self.level + if level is None: + return None + if not isinstance(level, int): + index = self.index + if level not in index.names: + raise AssertionError(f"Level {level} not in index") + return index.names.index(level) + return level @property def ngroups(self) -> int: @@ -595,6 +595,12 @@ def indices(self): @property def codes(self) -> np.ndarray: + if self._passed_categorical: + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes + cat = self.grouper + return cat.codes + if self._codes is None: self._make_codes() # error: Incompatible return value type (got "Optional[ndarray]", @@ -605,12 +611,33 @@ def codes(self) -> np.ndarray: def result_index(self) -> Index: if self.all_grouper is not None: group_idx = self.group_index - assert isinstance(group_idx, CategoricalIndex) # set in __init__ + assert isinstance(group_idx, CategoricalIndex) return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index - @property + @cache_readonly def group_index(self) -> Index: + if self._passed_categorical: + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes + cat = self.grouper + categories = cat.categories + + if self.observed: + codes = algorithms.unique1d(cat.codes) + codes = codes[codes != -1] + if self.sort or cat.ordered: + codes = np.sort(codes) + else: + codes = np.arange(len(categories)) + + return CategoricalIndex( + Categorical.from_codes( + codes=codes, categories=categories, ordered=cat.ordered + ), + name=self.name, + ) + if self._group_index is None: self._make_codes() assert self._group_index is not None From 1b04e51f21f67f2101e248355d8a71638a5532e3 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 15:45:14 -0700 Subject: [PATCH 2/6] mypy fixup --- pandas/core/groupby/generic.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py 
index 55e8578b2cef4..868dffb5e1b04 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -777,11 +777,7 @@ def apply_series_value_counts(): # multi-index components codes = self.grouper.reconstructed_codes codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - # error: List item 0 has incompatible type "Union[ndarray, Any]"; - # expected "Index" - levels = [ping.group_index for ping in self.grouper.groupings] + [ - lev # type: ignore[list-item] - ] + levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] if dropna: From 19c5db1cf200ff44a78bcca72f32ee6484f86af3 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 15:09:41 -0700 Subject: [PATCH 3/6] REF: remove _make_codes statefulness --- pandas/core/groupby/grouper.py | 66 ++++++++++++++++++---------------- pandas/core/groupby/ops.py | 15 +++++++- 2 files changed, 49 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 4aac2630feb2c..c4f7c9a7020ba 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -10,6 +10,7 @@ import numpy as np from pandas._typing import ( + ArrayLike, FrameOrSeries, final, ) @@ -590,17 +591,25 @@ def codes(self) -> np.ndarray: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes - cat = self.grouper - return cat.codes + return self._codes_and_uniques[0] - if self._codes is None: - self._make_codes() - # error: Incompatible return value type (got "Optional[ndarray]", - # expected "ndarray") - return self._codes # type: ignore[return-value] + elif self._codes is not None: + # set in __init__ + return self._codes + else: + return self._codes_and_uniques[0] + + @cache_readonly + def group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain 
ExtensionDtypes. + """ + return self._codes_and_uniques[1] @cache_readonly def result_index(self) -> Index: + # TODO: what's the difference between result_index vs group_index? if self.all_grouper is not None: group_idx = self.group_index assert isinstance(group_idx, CategoricalIndex) @@ -609,6 +618,14 @@ def result_index(self) -> Index: @cache_readonly def group_index(self) -> Index: + if self._group_index is not None: + # set in __init__ + return self._group_index # TODO: set this in codes_and_unique? + uniques = self.group_arraylike + return Index(uniques, name=self.name) + + @cache_readonly + def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes @@ -616,33 +633,22 @@ def group_index(self) -> Index: categories = cat.categories if self.observed: - codes = algorithms.unique1d(cat.codes) - codes = codes[codes != -1] + ucodes = algorithms.unique1d(cat.codes) + ucodes = ucodes[ucodes != -1] if self.sort or cat.ordered: - codes = np.sort(codes) + ucodes = np.sort(ucodes) else: - codes = np.arange(len(categories)) + ucodes = np.arange(len(categories)) - return CategoricalIndex( - Categorical.from_codes( - codes=codes, categories=categories, ordered=cat.ordered - ), - name=self.name, + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered ) + return cat.codes, uniques - if self._group_index is None: - self._make_codes() - assert self._group_index is not None - return self._group_index - - def _make_codes(self) -> None: - if self._codes is not None and self._group_index is not None: - return - - # we have a list of groupers - if isinstance(self.grouper, ops.BaseGrouper): + elif isinstance(self.grouper, ops.BaseGrouper): + # we have a list of groupers codes = self.grouper.codes_info - uniques = self.grouper.result_index + uniques = self.grouper.result_arraylike else: # GH35667, replace dropna=False 
with na_sentinel=None if not self.dropna: @@ -652,9 +658,7 @@ def _make_codes(self) -> None: codes, uniques = algorithms.factorize( self.grouper, sort=self.sort, na_sentinel=na_sentinel ) - uniques = Index(uniques, name=self.name) - self._codes = codes - self._group_index = uniques + return codes, uniques @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b37467fb8cf11..746c6e0056064 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -907,6 +907,19 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + @cache_readonly + def result_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but returning an ndarray/ExtensionArray + allowing us to retain ExtensionDtypes not supported by Index. + """ + # TODO: once Index supports arbitrary EAs, this can be removed in favor + # of result_index + if len(self.groupings) == 1: + return self.groupings[0].group_arraylike + + return self.result_index._values + @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: @@ -919,7 +932,7 @@ def result_index(self) -> Index: ) @final - def get_group_levels(self) -> list[Index]: + def get_group_levels(self) -> list[ArrayLike]: # Note: only called from _insert_inaxis_grouper_inplace, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: From 5df3eafb08a68eaea28c1cd85a5a682aadb5c9e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 15:51:43 -0700 Subject: [PATCH 4/6] simplify codes --- pandas/core/groupby/grouper.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c4f7c9a7020ba..895f1c96f0253 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -588,16 +588,11 @@ def 
indices(self): @property def codes(self) -> np.ndarray: - if self._passed_categorical: - # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes - return self._codes_and_uniques[0] - - elif self._codes is not None: + if self._codes is not None: # set in __init__ return self._codes - else: - return self._codes_and_uniques[0] + + return self._codes_and_uniques[0] @cache_readonly def group_arraylike(self) -> ArrayLike: From b1122caa003c9f204f53731ef540b4bdb8eb802a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 15:53:00 -0700 Subject: [PATCH 5/6] trim comment --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 895f1c96f0253..370ce6d2e46d9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -615,7 +615,7 @@ def result_index(self) -> Index: def group_index(self) -> Index: if self._group_index is not None: # set in __init__ - return self._group_index # TODO: set this in codes_and_unique? 
+ return self._group_index uniques = self.group_arraylike return Index(uniques, name=self.name) From 731bbe39cfb880a01ed466aaaa4328900a2a2afb Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 May 2021 10:27:51 -0700 Subject: [PATCH 6/6] clarify comment --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e5216d57185af..e2855cbc90425 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -589,7 +589,7 @@ def indices(self): @property def codes(self) -> np.ndarray: if self._codes is not None: - # set in __init__ + # _codes is set in __init__ for MultiIndex cases return self._codes return self._codes_and_uniques[0] @@ -614,7 +614,7 @@ def result_index(self) -> Index: @cache_readonly def group_index(self) -> Index: if self._group_index is not None: - # set in __init__ + # _group_index is set in __init__ for MultiIndex cases return self._group_index uniques = self.group_arraylike return Index(uniques, name=self.name)