diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8e12a8cb18b68..50837e1b3ed50 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6146,7 +6146,7 @@ def duplicated(
         if self.empty:
             return self._constructor_sliced(dtype=bool)
 
-        def f(vals):
+        def f(vals) -> tuple[np.ndarray, int]:
             labels, shape = algorithms.factorize(vals, size_hint=len(self))
             return labels.astype("i8", copy=False), len(shape)
 
@@ -6173,7 +6173,14 @@ def f(vals):
 
         vals = (col.values for name, col in self.items() if name in subset)
         labels, shape = map(list, zip(*map(f, vals)))
 
-        ids = get_group_index(labels, shape, sort=False, xnull=False)
+        ids = get_group_index(
+            labels,
+            # error: Argument 1 to "tuple" has incompatible type "List[_T]";
+            # expected "Iterable[int]"
+            tuple(shape),  # type: ignore[arg-type]
+            sort=False,
+            xnull=False,
+        )
         result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index)
         return result.__finalize__(self, method="duplicated")
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 794f13bbfb6b1..3ddb5b1248060 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1611,7 +1611,7 @@ def _inferred_type_levels(self) -> list[str]:
 
     @doc(Index.duplicated)
     def duplicated(self, keep="first") -> np.ndarray:
-        shape = map(len, self.levels)
+        shape = tuple(len(lev) for lev in self.levels)
         ids = get_group_index(self.codes, shape, sort=False, xnull=False)
 
         return duplicated_int64(ids, keep)
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index d1e076da9293d..037fe5366255a 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -142,7 +142,7 @@ def _indexer_and_to_sort(
         codes = list(self.index.codes)
         levs = list(self.index.levels)
         to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
-        sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]
+        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
 
         comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
         ngroups = len(obs_ids)
@@ -166,7 +166,7 @@ def _make_selectors(self):
 
         # make the mask
         remaining_labels = self.sorted_labels[:-1]
-        level_sizes = [len(x) for x in new_levels]
+        level_sizes = tuple(len(x) for x in new_levels)
 
         comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
         ngroups = len(obs_ids)
@@ -353,7 +353,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
     rcodes = [index.codes[i] for i in rlocs]
     rnames = [index.names[i] for i in rlocs]
 
-    shape = [len(x) for x in clevels]
+    shape = tuple(len(x) for x in clevels)
     group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
 
     comp_ids, obs_ids = compress_group_index(group_index, sort=False)
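The call sites above replace lazy `map` objects and plain lists with concrete tuples. Beyond satisfying the `Shape` alias (`tuple[int, ...]`), a tuple is re-iterable, while a `map` object is a one-shot iterator; a standalone sketch (illustration only, not part of the patch) of the difference:

import numpy as np

# Illustration only, not pandas code: a ``map`` object is exhausted by a
# single pass, so any consumer that traverses ``shape`` twice would
# silently see an empty sequence the second time. The tuple required by
# the ``Shape`` alias can be traversed repeatedly.
levels = [np.array(["a", "b", "c"]), np.array([1, 2])]

lazy_shape = map(len, levels)
assert list(lazy_shape) == [3, 2]
assert list(lazy_shape) == []  # already exhausted

tuple_shape = tuple(len(lev) for lev in levels)
assert list(tuple_shape) == [3, 2]
assert list(tuple_shape) == [3, 2]  # safe to re-iterate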
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index f5cd390f077a6..f6c1afbde0bd9 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -18,7 +18,10 @@
     lib,
 )
 from pandas._libs.hashtable import unique_label_indices
-from pandas._typing import IndexKeyFunc
+from pandas._typing import (
+    IndexKeyFunc,
+    Shape,
+)
 
 from pandas.core.dtypes.common import (
     ensure_int64,
@@ -93,7 +96,7 @@ def get_indexer_indexer(
     return indexer
 
 
-def get_group_index(labels, shape, sort: bool, xnull: bool):
+def get_group_index(labels, shape: Shape, sort: bool, xnull: bool):
     """
     For the particular label_list, gets the offsets into the hypothetical list
     representing the totally ordered cartesian product of all possible label
@@ -108,7 +111,7 @@
     ----------
     labels : sequence of arrays
         Integers identifying levels at each location
-    shape : sequence of ints
+    shape : tuple[int, ...]
         Number of unique levels at each location
     sort : bool
         If the ranks of returned ids should match lexical ranks of labels
@@ -134,33 +137,36 @@ def _int64_cut_off(shape) -> int:
                 return i
         return len(shape)
 
-    def maybe_lift(lab, size):
+    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
         # promote nan values (assigned -1 label in lab array)
         # so that all output values are non-negative
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
 
-    labels = map(ensure_int64, labels)
+    labels = [ensure_int64(x) for x in labels]
+    lshape = list(shape)
     if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+        for i, (lab, size) in enumerate(zip(labels, shape)):
+            lab, size = maybe_lift(lab, size)
+            labels[i] = lab
+            lshape[i] = size
 
     labels = list(labels)
-    shape = list(shape)
 
     # Iteratively process all the labels in chunks sized so less
    # than _INT64_MAX unique int ids will be required for each chunk
     while True:
 
         # how many levels can be done without overflow:
-        nlev = _int64_cut_off(shape)
+        nlev = _int64_cut_off(lshape)
 
         # compute flat ids for the first `nlev` levels
-        stride = np.prod(shape[1:nlev], dtype="i8")
+        stride = np.prod(lshape[1:nlev], dtype="i8")
         out = stride * labels[0].astype("i8", subok=False, copy=False)
 
         for i in range(1, nlev):
-            if shape[i] == 0:
-                stride = 0
+            if lshape[i] == 0:
+                stride = np.int64(0)
             else:
-                stride //= shape[i]
+                stride //= lshape[i]
             out += labels[i] * stride
 
         if xnull:  # exclude nulls
@@ -169,7 +175,7 @@ def maybe_lift(lab, size):
             mask |= lab == -1
         out[mask] = -1
 
-        if nlev == len(shape):  # all levels done!
+        if nlev == len(lshape):  # all levels done!
             break
 
         # compress what has been done so far in order to avoid overflow
@@ -177,12 +183,12 @@
         comp_ids, obs_ids = compress_group_index(out, sort=sort)
 
         labels = [comp_ids] + labels[nlev:]
-        shape = [len(obs_ids)] + shape[nlev:]
+        lshape = [len(obs_ids)] + lshape[nlev:]
 
     return out
 
 
-def get_compressed_ids(labels, sizes) -> tuple[np.ndarray, np.ndarray]:
+def get_compressed_ids(labels, sizes: Shape) -> tuple[np.ndarray, np.ndarray]:
     """
     Group_index is offsets into cartesian product of all possible labels. This
     space can be huge, so this function compresses it, by computing offsets
@@ -191,7 +197,7 @@
     Parameters
     ----------
     labels : list of label arrays
-    sizes : list of size of the levels
+    sizes : tuple[int] of size of the levels
 
     Returns
     -------
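The docstring above describes group ids as offsets into the cartesian product of the level values. A minimal standalone sketch of that stride arithmetic (assuming the product fits in int64, so omitting the chunked compression loop and null handling that the real `get_group_index` performs):

import numpy as np

# Sketch of the flat-id computation for two levels with 2 and 3
# categories: each row's id is its row-major offset into the 2 x 3
# cartesian product, i.e. id = codes[0] * 3 + codes[1].
labels = [np.array([0, 1, 1, 0]), np.array([0, 0, 2, 2])]
shape = (2, 3)

stride = np.prod(shape[1:], dtype="i8")  # 3
out = stride * labels[0].astype("i8")
for i in range(1, len(shape)):
    stride //= shape[i]
    out += labels[i] * stride

print(out.tolist())  # [0, 3, 5, 2] -> (0, 0), (1, 0), (1, 2), (0, 2)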
@@ -252,12 +258,11 @@ def decons_obs_group_ids(comp_ids: np.ndarray, obs_ids, shape, labels, xnull: bo
         return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
 
     # TODO: unique_label_indices only used here, should take ndarray[np.intp]
-    i = unique_label_indices(ensure_int64(comp_ids))
-    i8copy = lambda a: a.astype("i8", subok=False, copy=True)
-    return [i8copy(lab[i]) for lab in labels]
+    indexer = unique_label_indices(ensure_int64(comp_ids))
+    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
 
 
-def indexer_from_factorized(labels, shape, compress: bool = True) -> np.ndarray:
+def indexer_from_factorized(labels, shape: Shape, compress: bool = True) -> np.ndarray:
     # returned ndarray is np.intp
 
     ids = get_group_index(labels, shape, sort=True, xnull=False)
@@ -334,7 +339,7 @@ def lexsort_indexer(
         shape.append(n)
         labels.append(codes)
 
-    return indexer_from_factorized(labels, shape)
+    return indexer_from_factorized(labels, tuple(shape))
 
 
 def nargsort(
@@ -576,7 +581,7 @@ def get_indexer_dict(
     """
     shape = [len(x) for x in keys]
 
-    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    group_index = get_group_index(label_list, tuple(shape), sort=True, xnull=True)
     if np.all(group_index == -1):
         # Short-circuit, lib.indices_fast will return the same
         return {}
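The patch is intended as a type-only refactor: user-facing paths that funnel into `get_group_index`, such as `DataFrame.duplicated` and `MultiIndex.duplicated`, should behave exactly as before. A quick check of that expectation:

import pandas as pd

# Behavior check: duplicated() routes through get_group_index in both
# the DataFrame and MultiIndex cases touched by this patch.
df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
print(df.duplicated().tolist())  # [False, True, False]

mi = pd.MultiIndex.from_arrays([[1, 1, 2], ["x", "x", "y"]])
print(mi.duplicated().tolist())  # [False, True, False]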