From c1feca1e15f084ca3c8047eb71fb413f5d87f9a9 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 23 Feb 2022 14:07:02 -0800
Subject: [PATCH 1/5] TYP: groupby, sorting

---
 pandas/core/groupby/grouper.py |  6 +++---
 pandas/core/groupby/ops.py     |  2 +-
 pandas/core/indexes/base.py    |  4 +++-
 pandas/core/indexes/multi.py   |  4 +++-
 pandas/core/reshape/reshape.py |  1 -
 pandas/core/sorting.py         | 33 ++++++++++++++++++++++-----------
 pandas/tests/test_sorting.py   |  4 ++--
 7 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index af60192676597..bca5659fade01 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -459,7 +459,7 @@ class Grouping:
       * groups : dict of {group -> label_list}
     """
 
-    _codes: np.ndarray | None = None
+    _codes: npt.NDArray[np.signedinteger] | None = None
     _group_index: Index | None = None
     _passed_categorical: bool
     _all_grouper: Categorical | None
@@ -614,7 +614,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
         return values._reverse_indexer()
 
     @property
-    def codes(self) -> np.ndarray:
+    def codes(self) -> npt.NDArray[np.signedinteger]:
         if self._codes is not None:
             # _codes is set in __init__ for MultiIndex cases
             return self._codes
@@ -657,7 +657,7 @@ def group_index(self) -> Index:
         return Index._with_infer(uniques, name=self.name)
 
     @cache_readonly
-    def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
+    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index cf046d92dd6f3..cf70749b31b0c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -785,7 +785,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
 
     @final
     @property
-    def codes(self) -> list[np.ndarray]:
+    def codes(self) -> list[npt.NDArray[np.signedinteger]]:
         return [ping.codes for ping in self.groupings]
 
     @property
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index d7594f2483569..314b8f69375c6 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2187,7 +2187,9 @@ def _drop_level_numbers(self, levnums: list[int]):
                 verify_integrity=False,
             )
 
-    def _get_grouper_for_level(self, mapper, *, level=None):
+    def _get_grouper_for_level(
+        self, mapper, *, level=None
+    ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]:
         """
         Get index grouper corresponding to an index level
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index cc6c92a27e344..3ef9ad09cc29a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1481,7 +1481,9 @@ def _set_names(self, names, *, level=None, validate: bool = True):
     # --------------------------------------------------------------------
 
     @doc(Index._get_grouper_for_level)
-    def _get_grouper_for_level(self, mapper, *, level):
+    def _get_grouper_for_level(
+        self, mapper, *, level=None
+    ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]:
         indexer = self.codes[level]
         level_index = self.levels[level]
 
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 9488e15920039..5ad9beeca8c87 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -185,7 +185,6 @@ def _make_selectors(self):
 
         self.group_index = comp_index
         self.mask = mask
-        self.unique_groups = obs_ids
         self.compressor = comp_index.searchsorted(np.arange(ngroups))
 
     @cache_readonly
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 7ab53ccf7cb8d..2f04b9f10e960 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -193,7 +193,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
 
 def get_compressed_ids(
     labels, sizes: Shape
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
     """
     Group_index is offsets into cartesian product of all possible labels. This
     space can be huge, so this function compresses it, by computing offsets
@@ -208,7 +208,7 @@ def get_compressed_ids(
     -------
     np.ndarray[np.intp]
         comp_ids
-    np.ndarray[np.int64]
+    np.ndarray[np.intp]
         obs_group_ids
     """
     ids = get_group_index(labels, sizes, sort=True, xnull=False)
@@ -223,7 +223,9 @@ def is_int64_overflow_possible(shape) -> bool:
     return the_prod >= lib.i8max
 
 
-def decons_group_index(comp_labels, shape):
+def _decons_group_index(
+    comp_labels: npt.NDArray[np.intp], shape: Shape
+) -> list[npt.NDArray[np.intp]]:
     # reconstruct labels
     if is_int64_overflow_possible(shape):
         # at some point group indices are factorized,
@@ -232,7 +234,7 @@ def decons_group_index(comp_labels, shape):
 
     label_list = []
     factor = 1
-    y = 0
+    y = np.array(0)
     x = comp_labels
     for i in reversed(range(len(shape))):
         labels = (x - y) % (factor * shape[i]) // factor
@@ -244,24 +246,32 @@ def decons_group_index(comp_labels, shape):
 
 
 def decons_obs_group_ids(
-    comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool
-):
+    comp_ids: npt.NDArray[np.intp],
+    obs_ids: npt.NDArray[np.intp],
+    shape: Shape,
+    labels: Sequence[npt.NDArray[np.signedinteger]],
+    xnull: bool,
+) -> list[npt.NDArray[np.intp]]:
     """
     Reconstruct labels from observed group ids.
 
     Parameters
     ----------
     comp_ids : np.ndarray[np.intp]
+    obs_ids : np.ndarray[np.intp]
+    shape : tuple[int]
+    labels : Sequence[np.ndarray[np.signedinteger]]
     xnull : bool
         If nulls are excluded; i.e. -1 labels are passed through.
     """
     if not xnull:
-        lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8")
-        shape = np.asarray(shape, dtype="i8") + lift
+        lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
+        arr_shape = np.asarray(shape, dtype=np.intp) + lift
+        shape = tuple(arr_shape)
 
     if not is_int64_overflow_possible(shape):
         # obs ids are deconstructable! take the fast route!
-        out = decons_group_index(obs_ids, shape)
+        out = _decons_group_index(obs_ids, shape)
         return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
 
     indexer = unique_label_indices(comp_ids)
@@ -660,7 +670,7 @@ def get_group_index_sorter(
 
 def compress_group_index(
     group_index: npt.NDArray[np.int64], sort: bool = True
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
     """
     Group_index is offsets into cartesian product of all possible labels. This
     space can be huge, so this function compresses it, by computing offsets
@@ -673,11 +683,12 @@ def compress_group_index(
 
     # note, group labels come out ascending (ie, 1,2,3 etc)
     comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
+    obs_group_ids = ensure_platform_int(obs_group_ids)  # int64->int32 on 32bit
 
     if sort and len(obs_group_ids) > 0:
         obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
 
-    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
+    return ensure_int64(comp_ids), ensure_platform_int(obs_group_ids)
 
 
 def _reorder_by_uniques(
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 37820fe31b6db..396c4d82d01fc 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -22,7 +22,7 @@
 from pandas.core.algorithms import safe_sort
 import pandas.core.common as com
 from pandas.core.sorting import (
-    decons_group_index,
+    _decons_group_index,
     get_group_index,
     is_int64_overflow_possible,
     lexsort_indexer,
@@ -389,7 +389,7 @@ def align(df):
 )
 def test_decons(codes_list, shape):
     group_index = get_group_index(codes_list, shape, sort=True, xnull=True)
-    codes_list2 = decons_group_index(group_index, shape)
+    codes_list2 = _decons_group_index(group_index, shape)
 
     for a, b in zip(codes_list, codes_list2):
         tm.assert_numpy_array_equal(a, b)

From 4f221a0f2bceb1eaab7f9852912dea28b80a320d Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 23 Feb 2022 16:07:54 -0800
Subject: [PATCH 2/5] TYP: annotate reconstructed_codes return type

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index cf70749b31b0c..4806bd95bf778 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -863,7 +863,7 @@ def ngroups(self) -> int:
         return len(self.result_index)
 
     @property
-    def reconstructed_codes(self) -> list[np.ndarray]:
+    def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
         codes = self.codes
         ids, obs_ids, _ = self.group_info
         return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)

From 51cb44514d4ed49a685c6470bf1dd8cd1c8d6afd Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 23 Feb 2022 16:18:12 -0800
Subject: [PATCH 3/5] DEBUG: add temporary int32-range assertion for 32bit build

---
 pandas/core/reshape/reshape.py | 4 +++-
 pandas/core/sorting.py         | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 5ad9beeca8c87..245f1b72501c4 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -387,9 +387,11 @@ def _unstack_multiple(data, clocs, fill_value=None):
     rnames = [index.names[i] for i in rlocs]
 
     shape = tuple(len(x) for x in clevels)
-    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
 
+    # TODO: why sort=False here? if flipped, could use get_compressed_ids
+    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
     comp_ids, obs_ids = compress_group_index(group_index, sort=False)
+
     recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
 
     if not rlocs:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 2f04b9f10e960..9662083bbd3e3 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -683,6 +683,8 @@ def compress_group_index(
 
     # note, group labels come out ascending (ie, 1,2,3 etc)
     comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
+
+    assert (obs_group_ids <= np.iinfo(np.int32).max).all()  # debugging 32bit build
     obs_group_ids = ensure_platform_int(obs_group_ids)  # int64->int32 on 32bit
 
     if sort and len(obs_group_ids) > 0:

From c1ec2f92ba429ba74cb25ec276d996b904c8b0fe Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 23 Feb 2022 18:10:21 -0800
Subject: [PATCH 4/5] remove debugging assertion

---
 pandas/core/sorting.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 9662083bbd3e3..1a9de16b74ab8 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -684,7 +684,6 @@ def compress_group_index(
     # note, group labels come out ascending (ie, 1,2,3 etc)
     comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
 
-    assert (obs_group_ids <= np.iinfo(np.int32).max).all()  # debugging 32bit build
     obs_group_ids = ensure_platform_int(obs_group_ids)  # int64->int32 on 32bit
 
     if sort and len(obs_group_ids) > 0:

From 877761e3924a952715feb84dee36cd3c611ff85e Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 24 Feb 2022 12:54:37 -0800
Subject: [PATCH 5/5] fix 32bit build: keep obs_group_ids as int64

---
 pandas/core/groupby/ops.py     |  5 ++++-
 pandas/core/reshape/reshape.py |  4 +---
 pandas/core/sorting.py         | 10 ++++------
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 4806bd95bf778..57bd541eec1f3 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -848,11 +848,14 @@ def codes_info(self) -> npt.NDArray[np.intp]:
         return ids
 
     @final
-    def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.intp]]:
+    def _get_compressed_codes(
+        self,
+    ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]:
         # The first returned ndarray may have any signed integer dtype
         if len(self.groupings) > 1:
             group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
             return compress_group_index(group_index, sort=self._sort)
+            # FIXME: compress_group_index's second return value is int64, not intp
 
         ping = self.groupings[0]
         return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 245f1b72501c4..5ad9beeca8c87 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -387,11 +387,9 @@ def _unstack_multiple(data, clocs, fill_value=None):
     rnames = [index.names[i] for i in rlocs]
 
     shape = tuple(len(x) for x in clevels)
-
-    # TODO: why sort=False here? if flipped, could use get_compressed_ids
     group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
-    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
 
+    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
     recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
 
     if not rlocs:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 1a9de16b74ab8..0876c942087c5 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -193,7 +193,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
 
 def get_compressed_ids(
     labels, sizes: Shape
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
     """
     Group_index is offsets into cartesian product of all possible labels. This
     space can be huge, so this function compresses it, by computing offsets
@@ -208,7 +208,7 @@ def get_compressed_ids(
     -------
     np.ndarray[np.intp]
         comp_ids
-    np.ndarray[np.intp]
+    np.ndarray[np.int64]
         obs_group_ids
     """
     ids = get_group_index(labels, sizes, sort=True, xnull=False)
@@ -670,7 +670,7 @@ def get_group_index_sorter(
 
 def compress_group_index(
     group_index: npt.NDArray[np.int64], sort: bool = True
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
     """
     Group_index is offsets into cartesian product of all possible labels. This
     space can be huge, so this function compresses it, by computing offsets
@@ -684,12 +684,10 @@ def compress_group_index(
     # note, group labels come out ascending (ie, 1,2,3 etc)
     comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
 
-    obs_group_ids = ensure_platform_int(obs_group_ids)  # int64->int32 on 32bit
-
     if sort and len(obs_group_ids) > 0:
         obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
 
-    return ensure_int64(comp_ids), ensure_platform_int(obs_group_ids)
+    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
 
 
 def _reorder_by_uniques(