From c1feca1e15f084ca3c8047eb71fb413f5d87f9a9 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Feb 2022 14:07:02 -0800 Subject: [PATCH 1/5] TYP: groupby, sorting --- pandas/core/groupby/grouper.py | 6 +++--- pandas/core/groupby/ops.py | 2 +- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/multi.py | 4 +++- pandas/core/reshape/reshape.py | 1 - pandas/core/sorting.py | 33 ++++++++++++++++++++++----------- pandas/tests/test_sorting.py | 4 ++-- 7 files changed, 34 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index af60192676597..bca5659fade01 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -459,7 +459,7 @@ class Grouping: * groups : dict of {group -> label_list} """ - _codes: np.ndarray | None = None + _codes: npt.NDArray[np.signedinteger] | None = None _group_index: Index | None = None _passed_categorical: bool _all_grouper: Categorical | None @@ -614,7 +614,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: return values._reverse_indexer() @property - def codes(self) -> np.ndarray: + def codes(self) -> npt.NDArray[np.signedinteger]: if self._codes is not None: # _codes is set in __init__ for MultiIndex cases return self._codes @@ -657,7 +657,7 @@ def group_index(self) -> Index: return Index._with_infer(uniques, name=self.name) @cache_readonly - def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: + def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index cf046d92dd6f3..cf70749b31b0c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -785,7 +785,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @final @property - def codes(self) -> list[np.ndarray]: + def codes(self) -> list[npt.NDArray[np.signedinteger]]: return [ping.codes for ping in self.groupings] @property diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d7594f2483569..314b8f69375c6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2187,7 +2187,9 @@ def _drop_level_numbers(self, levnums: list[int]): verify_integrity=False, ) - def _get_grouper_for_level(self, mapper, *, level=None): + def _get_grouper_for_level( + self, mapper, *, level=None + ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: """ Get index grouper corresponding to an index level diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cc6c92a27e344..3ef9ad09cc29a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1481,7 +1481,9 @@ def _set_names(self, names, *, level=None, validate: bool = True): # -------------------------------------------------------------------- @doc(Index._get_grouper_for_level) - def _get_grouper_for_level(self, mapper, *, level): + def _get_grouper_for_level( + self, mapper, *, level=None + ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: indexer = self.codes[level] level_index = self.levels[level] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9488e15920039..5ad9beeca8c87 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -185,7 +185,6 @@ def _make_selectors(self): self.group_index = comp_index self.mask = mask - self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups)) @cache_readonly diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 7ab53ccf7cb8d..2f04b9f10e960 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -193,7 +193,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]: def get_compressed_ids( labels, sizes: Shape -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -208,7 +208,7 @@ def get_compressed_ids( ------- np.ndarray[np.intp] comp_ids - np.ndarray[np.int64] + np.ndarray[np.intp] obs_group_ids """ ids = get_group_index(labels, sizes, sort=True, xnull=False) @@ -223,7 +223,9 @@ def is_int64_overflow_possible(shape) -> bool: return the_prod >= lib.i8max -def decons_group_index(comp_labels, shape): +def _decons_group_index( + comp_labels: npt.NDArray[np.intp], shape: Shape +) -> list[npt.NDArray[np.intp]]: # reconstruct labels if is_int64_overflow_possible(shape): # at some point group indices are factorized, @@ -232,7 +234,7 @@ def decons_group_index(comp_labels, shape): label_list = [] factor = 1 - y = 0 + y = np.array(0) x = comp_labels for i in reversed(range(len(shape))): labels = (x - y) % (factor * shape[i]) // factor @@ -244,24 +246,32 @@ def decons_group_index(comp_labels, shape): def decons_obs_group_ids( - comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool -): + comp_ids: npt.NDArray[np.intp], + obs_ids: npt.NDArray[np.intp], + shape: Shape, + labels: Sequence[npt.NDArray[np.signedinteger]], + xnull: bool, +) -> list[npt.NDArray[np.intp]]: """ Reconstruct labels from observed group ids. Parameters ---------- comp_ids : np.ndarray[np.intp] + obs_ids: np.ndarray[np.intp] + shape : tuple[int] + labels : Sequence[np.ndarray[np.signedinteger]] xnull : bool If nulls are excluded; i.e. -1 labels are passed through. """ if not xnull: - lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") - shape = np.asarray(shape, dtype="i8") + lift + lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp) + arr_shape = np.asarray(shape, dtype=np.intp) + lift + shape = tuple(arr_shape) if not is_int64_overflow_possible(shape): # obs ids are deconstructable! take the fast route! - out = decons_group_index(obs_ids, shape) + out = _decons_group_index(obs_ids, shape) return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] indexer = unique_label_indices(comp_ids) @@ -660,7 +670,7 @@ def get_group_index_sorter( def compress_group_index( group_index: npt.NDArray[np.int64], sort: bool = True -) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -673,11 +683,12 @@ def compress_group_index( # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + obs_group_ids = ensure_platform_int(obs_group_ids) # int64->int32 on 32bit if sort and len(obs_group_ids) > 0: obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - return ensure_int64(comp_ids), ensure_int64(obs_group_ids) + return ensure_int64(comp_ids), ensure_platform_int(obs_group_ids) def _reorder_by_uniques( diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 37820fe31b6db..396c4d82d01fc 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -22,7 +22,7 @@ from pandas.core.algorithms import safe_sort import pandas.core.common as com from pandas.core.sorting import ( - decons_group_index, + _decons_group_index, get_group_index, is_int64_overflow_possible, lexsort_indexer, @@ -389,7 +389,7 @@ def align(df): ) def test_decons(codes_list, shape): group_index = get_group_index(codes_list, shape, sort=True, xnull=True) - codes_list2 = decons_group_index(group_index, shape) + codes_list2 = _decons_group_index(group_index, shape) for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b) From 4f221a0f2bceb1eaab7f9852912dea28b80a320d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Feb 2022 16:07:54 -0800 Subject: [PATCH 2/5] typ --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index cf70749b31b0c..4806bd95bf778 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -863,7 +863,7 @@ def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[np.ndarray]: + def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: codes = self.codes ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) From 51cb44514d4ed49a685c6470bf1dd8cd1c8d6afd Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Feb 2022 16:18:12 -0800 Subject: [PATCH 3/5] debugging assertion --- pandas/core/reshape/reshape.py | 4 +++- pandas/core/sorting.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 5ad9beeca8c87..245f1b72501c4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -387,9 +387,11 @@ def _unstack_multiple(data, clocs, fill_value=None): rnames = [index.names[i] for i in rlocs] shape = tuple(len(x) for x in clevels) - group_index = get_group_index(ccodes, shape, sort=False, xnull=False) + # TODO: why sort=False here? if flipped, could use get_compressed_ids + group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if not rlocs: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2f04b9f10e960..9662083bbd3e3 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -683,6 +683,8 @@ def compress_group_index( # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + + assert (obs_group_ids <= np.iinfo(np.int32).max).all() # debugging 32bit build obs_group_ids = ensure_platform_int(obs_group_ids) # int64->int32 on 32bit if sort and len(obs_group_ids) > 0: From c1ec2f92ba429ba74cb25ec276d996b904c8b0fe Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Feb 2022 18:10:21 -0800 Subject: [PATCH 4/5] remove debugging assertion --- pandas/core/sorting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 9662083bbd3e3..1a9de16b74ab8 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -684,7 +684,6 @@ def compress_group_index( # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) - assert (obs_group_ids <= np.iinfo(np.int32).max).all() # debugging 32bit build obs_group_ids = ensure_platform_int(obs_group_ids) # int64->int32 on 32bit if sort and len(obs_group_ids) > 0: From 877761e3924a952715feb84dee36cd3c611ff85e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 24 Feb 2022 12:54:37 -0800 Subject: [PATCH 5/5] fix 32bit build --- pandas/core/groupby/ops.py | 5 ++++- pandas/core/reshape/reshape.py | 4 +--- pandas/core/sorting.py | 10 ++++------ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4806bd95bf778..57bd541eec1f3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -848,11 +848,14 @@ def codes_info(self) -> npt.NDArray[np.intp]: return ids @final - def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.intp]]: + def _get_compressed_codes( + self, + ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: # The first returned ndarray may have any signed integer dtype if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) + # FIXME: compress_group_index's second return value is int64, not intp ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 245f1b72501c4..5ad9beeca8c87 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -387,11 +387,9 @@ def _unstack_multiple(data, clocs, fill_value=None): rnames = [index.names[i] for i in rlocs] shape = tuple(len(x) for x in clevels) - - # TODO: why sort=False here? if flipped, could use get_compressed_ids group_index = get_group_index(ccodes, shape, sort=False, xnull=False) - comp_ids, obs_ids = compress_group_index(group_index, sort=False) + comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if not rlocs: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 1a9de16b74ab8..0876c942087c5 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -193,7 +193,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]: def get_compressed_ids( labels, sizes: Shape -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -208,7 +208,7 @@ def get_compressed_ids( ------- np.ndarray[np.intp] comp_ids - np.ndarray[np.intp] + np.ndarray[np.int64] obs_group_ids """ ids = get_group_index(labels, sizes, sort=True, xnull=False) @@ -670,7 +670,7 @@ def get_group_index_sorter( def compress_group_index( group_index: npt.NDArray[np.int64], sort: bool = True -) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]: +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -684,12 +684,10 @@ def compress_group_index( # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) - obs_group_ids = ensure_platform_int(obs_group_ids) # int64->int32 on 32bit - if sort and len(obs_group_ids) > 0: obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - return ensure_int64(comp_ids), ensure_platform_int(obs_group_ids) + return ensure_int64(comp_ids), ensure_int64(obs_group_ids) def _reorder_by_uniques(