From 6a68c219dffa79521ecf9df0c985151a5f35d354 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 7 May 2021 14:53:05 -0700
Subject: [PATCH 1/6] REF: do less in Grouping.__init__

---
 pandas/core/groupby/grouper.py | 93 ++++++++++++++++++++++------------
 1 file changed, 60 insertions(+), 33 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index f1762a2535ff7..8b4d3fab8f950 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -439,6 +439,9 @@ class Grouping:
       * groups : dict of {group -> label_list}
     """
 
+    _codes: np.ndarray | None = None
+    _group_index: Index | None = None
+
     def __init__(
         self,
         index: Index,
@@ -462,6 +465,8 @@ def __init__(
         self.in_axis = in_axis
         self.dropna = dropna
 
+        self._passed_categorical = False
+
         # right place for this?
         if isinstance(grouper, (Series, Index)) and name is None:
             self.name = grouper.name
@@ -472,20 +477,16 @@ def __init__(
         # we have a single grouper which may be a myriad of things,
         # some of which are dependent on the passing in level
 
-        if level is not None:
-            if not isinstance(level, int):
-                if level not in index.names:
-                    raise AssertionError(f"Level {level} not in index")
-                level = index.names.index(level)
-
+        ilevel = self._ilevel
+        if ilevel is not None:
             if self.name is None:
-                self.name = index.names[level]
+                self.name = index.names[ilevel]
 
             (
-                self.grouper,
+                self.grouper,  # Index
                 self._codes,
                 self._group_index,
-            ) = index._get_grouper_for_level(self.grouper, level)
+            ) = index._get_grouper_for_level(self.grouper, ilevel)
 
         # a passed Grouper like, directly get the grouper in the same way
         # as single grouper groupby, use the group_info to get codes
@@ -509,37 +510,24 @@ def __init__(
             if self.grouper is None and self.name is not None and self.obj is not None:
                 self.grouper = self.obj[self.name]
 
+                if self.grouper.ndim > 1:
+                    # i.e. DataFrame case reachable if columns non-unique
+                    t = self.name or str(type(self.grouper))
+                    raise ValueError(f"Grouper for '{t}' not 1-dimensional")
+
             elif isinstance(self.grouper, (list, tuple)):
                 self.grouper = com.asarray_tuplesafe(self.grouper)
 
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
+                self._passed_categorical = True
 
                 self.grouper, self.all_grouper = recode_for_groupby(
                     self.grouper, self.sort, observed
                 )
-                categories = self.grouper.categories
-
-                # we make a CategoricalIndex out of the cat grouper
-                # preserving the categories / ordered attributes
-                self._codes = self.grouper.codes
-                if observed:
-                    codes = algorithms.unique1d(self.grouper.codes)
-                    codes = codes[codes != -1]
-                    if sort or self.grouper.ordered:
-                        codes = np.sort(codes)
-                else:
-                    codes = np.arange(len(categories))
-
-                self._group_index = CategoricalIndex(
-                    Categorical.from_codes(
-                        codes=codes, categories=categories, ordered=self.grouper.ordered
-                    ),
-                    name=self.name,
-                )
 
             # we are done
-            if isinstance(self.grouper, Grouping):
+            elif isinstance(self.grouper, Grouping):
                 self.grouper = self.grouper.grouper
 
             # no level passed
@@ -577,8 +565,20 @@ def __repr__(self) -> str:
     def __iter__(self):
         return iter(self.indices)
 
-    _codes: np.ndarray | None = None
-    _group_index: Index | None = None
+    @cache_readonly
+    def _ilevel(self) -> int | None:
+        """
+        If necessary, convert index level name to index level position.
+        """
+        level = self.level
+        if level is None:
+            return None
+        if not isinstance(level, int):
+            index = self.index
+            if level not in index.names:
+                raise AssertionError(f"Level {level} not in index")
+            return index.names.index(level)
+        return level
 
     @property
     def ngroups(self) -> int:
@@ -595,6 +595,12 @@ def indices(self):
 
     @property
     def codes(self) -> np.ndarray:
+        if self._passed_categorical:
+            # we make a CategoricalIndex out of the cat grouper
+            # preserving the categories / ordered attributes
+            cat = self.grouper
+            return cat.codes
+
         if self._codes is None:
             self._make_codes()
         # error: Incompatible return value type (got "Optional[ndarray]",
@@ -605,12 +611,33 @@ def codes(self) -> np.ndarray:
     def result_index(self) -> Index:
         if self.all_grouper is not None:
             group_idx = self.group_index
-            assert isinstance(group_idx, CategoricalIndex)  # set in __init__
+            assert isinstance(group_idx, CategoricalIndex)
             return recode_from_groupby(self.all_grouper, self.sort, group_idx)
         return self.group_index
 
-    @property
+    @cache_readonly
     def group_index(self) -> Index:
+        if self._passed_categorical:
+            # we make a CategoricalIndex out of the cat grouper
+            # preserving the categories / ordered attributes
+            cat = self.grouper
+            categories = cat.categories
+
+            if self.observed:
+                codes = algorithms.unique1d(cat.codes)
+                codes = codes[codes != -1]
+                if self.sort or cat.ordered:
+                    codes = np.sort(codes)
+            else:
+                codes = np.arange(len(categories))
+
+            return CategoricalIndex(
+                Categorical.from_codes(
+                    codes=codes, categories=categories, ordered=cat.ordered
+                ),
+                name=self.name,
+            )
+
         if self._group_index is None:
             self._make_codes()
         assert self._group_index is not None

From 1b04e51f21f67f2101e248355d8a71638a5532e3 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 7 May 2021 15:45:14 -0700
Subject: [PATCH 2/6] mypy fixup

---
 pandas/core/groupby/generic.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 55e8578b2cef4..868dffb5e1b04 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -777,11 +777,7 @@ def apply_series_value_counts():
         # multi-index components
         codes = self.grouper.reconstructed_codes
         codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
-        # error: List item 0 has incompatible type "Union[ndarray, Any]";
-        # expected "Index"
-        levels = [ping.group_index for ping in self.grouper.groupings] + [
-            lev  # type: ignore[list-item]
-        ]
+        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
         names = self.grouper.names + [self._selection_name]
 
         if dropna:

From 19c5db1cf200ff44a78bcca72f32ee6484f86af3 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 17 May 2021 15:09:41 -0700
Subject: [PATCH 3/6] REF: remove _make_codes statefulness

---
 pandas/core/groupby/grouper.py | 66 ++++++++++++++++++----------------
 pandas/core/groupby/ops.py     | 15 +++++++-
 2 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 4aac2630feb2c..c4f7c9a7020ba 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 from pandas._typing import (
+    ArrayLike,
     FrameOrSeries,
     final,
 )
@@ -590,17 +591,25 @@ def codes(self) -> np.ndarray:
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes
-            cat = self.grouper
-            return cat.codes
+            return self._codes_and_uniques[0]
 
-        if self._codes is None:
-            self._make_codes()
-        # error: Incompatible return value type (got "Optional[ndarray]",
-        # expected "ndarray")
-        return self._codes  # type: ignore[return-value]
+        elif self._codes is not None:
+            # set in __init__
+            return self._codes
+        else:
+            return self._codes_and_uniques[0]
+
+    @cache_readonly
+    def group_arraylike(self) -> ArrayLike:
+        """
+        Analogous to result_index, but holding an ArrayLike to ensure
+        we can retain ExtensionDtypes.
+        """
+        return self._codes_and_uniques[1]
 
     @cache_readonly
     def result_index(self) -> Index:
+        # TODO: what's the difference between result_index vs group_index?
         if self.all_grouper is not None:
             group_idx = self.group_index
             assert isinstance(group_idx, CategoricalIndex)
@@ -609,6 +618,14 @@ def result_index(self) -> Index:
 
     @cache_readonly
     def group_index(self) -> Index:
+        if self._group_index is not None:
+            # set in __init__
+            return self._group_index  # TODO: set this in codes_and_unique?
+        uniques = self.group_arraylike
+        return Index(uniques, name=self.name)
+
+    @cache_readonly
+    def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes
@@ -616,33 +633,22 @@ def group_index(self) -> Index:
             categories = cat.categories
 
             if self.observed:
-                codes = algorithms.unique1d(cat.codes)
-                codes = codes[codes != -1]
+                ucodes = algorithms.unique1d(cat.codes)
+                ucodes = ucodes[ucodes != -1]
                 if self.sort or cat.ordered:
-                    codes = np.sort(codes)
+                    ucodes = np.sort(ucodes)
             else:
-                codes = np.arange(len(categories))
+                ucodes = np.arange(len(categories))
 
-            return CategoricalIndex(
-                Categorical.from_codes(
-                    codes=codes, categories=categories, ordered=cat.ordered
-                ),
-                name=self.name,
+            uniques = Categorical.from_codes(
+                codes=ucodes, categories=categories, ordered=cat.ordered
             )
+            return cat.codes, uniques
 
-        if self._group_index is None:
-            self._make_codes()
-        assert self._group_index is not None
-        return self._group_index
-
-    def _make_codes(self) -> None:
-        if self._codes is not None and self._group_index is not None:
-            return
-
-        # we have a list of groupers
-        if isinstance(self.grouper, ops.BaseGrouper):
+        elif isinstance(self.grouper, ops.BaseGrouper):
+            # we have a list of groupers
             codes = self.grouper.codes_info
-            uniques = self.grouper.result_index
+            uniques = self.grouper.result_arraylike
         else:
             # GH35667, replace dropna=False with na_sentinel=None
             if not self.dropna:
@@ -652,9 +658,7 @@ def _make_codes(self) -> None:
             codes, uniques = algorithms.factorize(
                 self.grouper, sort=self.sort, na_sentinel=na_sentinel
             )
-            uniques = Index(uniques, name=self.name)
-        self._codes = codes
-        self._group_index = uniques
+        return codes, uniques
 
     @cache_readonly
     def groups(self) -> dict[Hashable, np.ndarray]:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b37467fb8cf11..746c6e0056064 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -907,6 +907,19 @@ def reconstructed_codes(self) -> list[np.ndarray]:
         ids, obs_ids, _ = self.group_info
         return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
 
+    @cache_readonly
+    def result_arraylike(self) -> ArrayLike:
+        """
+        Analogous to result_index, but returning an ndarray/ExtensionArray
+        allowing us to retain ExtensionDtypes not supported by Index.
+        """
+        # TODO: once Index supports arbitrary EAs, this can be removed in favor
+        #  of result_index
+        if len(self.groupings) == 1:
+            return self.groupings[0].group_arraylike
+
+        return self.result_index._values
+
     @cache_readonly
     def result_index(self) -> Index:
         if len(self.groupings) == 1:
@@ -919,7 +932,7 @@ def result_index(self) -> Index:
         )
 
     @final
-    def get_group_levels(self) -> list[Index]:
+    def get_group_levels(self) -> list[ArrayLike]:
         # Note: only called from _insert_inaxis_grouper_inplace, which
         #  is only called for BaseGrouper, never for BinGrouper
         if len(self.groupings) == 1:

From 5df3eafb08a68eaea28c1cd85a5a682aadb5c9e0 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 17 May 2021 15:51:43 -0700
Subject: [PATCH 4/6] simplify codes

---
 pandas/core/groupby/grouper.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index c4f7c9a7020ba..895f1c96f0253 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -588,16 +588,11 @@ def indices(self):
 
     @property
     def codes(self) -> np.ndarray:
-        if self._passed_categorical:
-            # we make a CategoricalIndex out of the cat grouper
-            # preserving the categories / ordered attributes
-            return self._codes_and_uniques[0]
-
-        elif self._codes is not None:
+        if self._codes is not None:
             # set in __init__
             return self._codes
-        else:
-            return self._codes_and_uniques[0]
+
+        return self._codes_and_uniques[0]
 
     @cache_readonly
     def group_arraylike(self) -> ArrayLike:

From b1122caa003c9f204f53731ef540b4bdb8eb802a Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 17 May 2021 15:53:00 -0700
Subject: [PATCH 5/6] trim comment

---
 pandas/core/groupby/grouper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 895f1c96f0253..370ce6d2e46d9 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -615,7 +615,7 @@ def result_index(self) -> Index:
     def group_index(self) -> Index:
         if self._group_index is not None:
             # set in __init__
-            return self._group_index  # TODO: set this in codes_and_unique?
+            return self._group_index
         uniques = self.group_arraylike
         return Index(uniques, name=self.name)
 

From 731bbe39cfb880a01ed466aaaa4328900a2a2afb Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 18 May 2021 10:27:51 -0700
Subject: [PATCH 6/6] clarify comment

---
 pandas/core/groupby/grouper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index e5216d57185af..e2855cbc90425 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -589,7 +589,7 @@ def indices(self):
     @property
     def codes(self) -> np.ndarray:
         if self._codes is not None:
-            # set in __init__
+            # _codes is set in __init__ for MultiIndex cases
             return self._codes
 
         return self._codes_and_uniques[0]
@@ -614,7 +614,7 @@ def result_index(self) -> Index:
     @cache_readonly
     def group_index(self) -> Index:
         if self._group_index is not None:
-            # set in __init__
+            # _group_index is set in __init__ for MultiIndex cases
             return self._group_index
         uniques = self.group_arraylike
         return Index(uniques, name=self.name)