From 6ce957d2707ba3b2d8bdb7d45192efae91263f08 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 25 Jun 2021 19:58:14 -0700 Subject: [PATCH 1/2] CLN: simplify MultiIndex.get_locs --- pandas/core/indexes/multi.py | 66 ++++++++++++++++++------------------ pandas/core/indexing.py | 4 +-- 2 files changed, 34 insertions(+), 36 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 39efc57052bc4..985c5617b8e29 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3035,7 +3035,9 @@ def partial_selection(key, indexer=None): indexer = self._get_level_indexer(key, level=level) return indexer, maybe_mi_droplevels(indexer, [level], drop_level) - def _get_level_indexer(self, key, level: int = 0, indexer=None): + def _get_level_indexer( + self, key, level: int = 0, indexer: Int64Index | None = None + ): # `level` kwarg is _always_ positional, never name # return an indexer, boolean array or a slice showing where the key is # in the totality of values @@ -3188,10 +3190,12 @@ def get_locs(self, seq): "MultiIndex slicing requires the index to be lexsorted: slicing " f"on levels {true_slices}, lexsort depth {self._lexsort_depth}" ) - # indexer - # this is the list of all values that we want to select + n = len(self) - indexer = None + # indexer is the list of all positions that we want to take; we + # start with it being everything and narrow it down as we look at each + # entry in `seq` + indexer = Index(np.arange(n)) def _convert_to_indexer(r) -> Int64Index: # return an indexer @@ -3209,14 +3213,10 @@ def _convert_to_indexer(r) -> Int64Index: r = r.nonzero()[0] return Int64Index(r) - def _update_indexer(idxr: Index | None, indexer: Index | None, key) -> Index: - if indexer is None: - indexer = Index(np.arange(n)) - if idxr is None: - return indexer + def _update_indexer(idxr: Index, indexer: Index) -> Index: indexer_intersection = indexer.intersection(idxr) if indexer_intersection.empty and not idxr.empty and not indexer.empty: - raise KeyError(key) + raise KeyError(seq) return indexer_intersection for i, k in enumerate(seq): @@ -3224,65 +3224,65 @@ def _update_indexer(idxr: Index | None, indexer: Index | None, key) -> Index: if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - indexer = _update_indexer( - _convert_to_indexer(k), indexer=indexer, key=seq - ) + lvl_indexer = _convert_to_indexer(k) + indexer = _update_indexer(lvl_indexer, indexer=indexer) elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) + indexers: Int64Index | None = None for x in k: try: - idxrs = _convert_to_indexer( - self._get_level_indexer(x, level=i, indexer=indexer) - ) - indexers = (idxrs if indexers is None else indexers).union( - idxrs, sort=False + item_lvl_indexer = self._get_level_indexer( + x, level=i, indexer=indexer ) except KeyError: - - # ignore not founds + # ignore not founds; see discussion in GH#39424 continue + else: + idxrs = _convert_to_indexer(item_lvl_indexer) + + if indexers is None: + indexers = idxrs + else: + indexers = indexers.union(idxrs, sort=False) if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer, key=seq) + indexer = _update_indexer(indexers, indexer=indexer) else: # no matches we are done - return np.array([], dtype=np.int64) + # test_loc_getitem_duplicates_multiindex_empty_indexer + return np.array([], dtype=np.intp) elif com.is_null_slice(k): # empty slice - indexer = _update_indexer(None, indexer=indexer, key=seq) + pass elif isinstance(k, slice): # a slice, include BOTH of the labels + lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer) indexer = _update_indexer( - _convert_to_indexer( - self._get_level_indexer(k, level=i, indexer=indexer) - ), + _convert_to_indexer(lvl_indexer), indexer=indexer, - key=seq, ) else: # a single label + lvl_indexer = self.get_loc_level(k, level=i, drop_level=False)[0] indexer = _update_indexer( - _convert_to_indexer( - self.get_loc_level(k, level=i, drop_level=False)[0] - ), + _convert_to_indexer(lvl_indexer), indexer=indexer, - key=seq, ) # empty indexer if indexer is None: - return np.array([], dtype=np.int64) + return np.array([], dtype=np.intp) assert isinstance(indexer, Int64Index), type(indexer) indexer = self._reorder_indexer(seq, indexer) - return indexer._values + return indexer._values.astype(np.intp, copy=False) # -------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f8578d87e4cad..6e97ce95297d9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1236,9 +1236,7 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): return {"key": key} if is_nested_tuple(key, labels): - if isinstance(self.obj, ABCSeries) and any( - isinstance(k, tuple) for k in key - ): + if self.ndim == 1 and any(isinstance(k, tuple) for k in key): # GH#35349 Raise if tuple in tuple for series raise ValueError("Too many indices") return labels.get_locs(key) From 88160a952b1b8a8a74048e4c78d7584bbf13b60e Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 27 Jun 2021 09:49:15 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/indexes/multi.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 985c5617b8e29..6666a08f4c76f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3234,8 +3234,10 @@ def _update_indexer(idxr: Index, indexer: Index) -> Index: indexers: Int64Index | None = None for x in k: try: + # Argument "indexer" to "_get_level_indexer" of "MultiIndex" + # has incompatible type "Index"; expected "Optional[Int64Index]" item_lvl_indexer = self._get_level_indexer( - x, level=i, indexer=indexer + x, level=i, indexer=indexer # type: ignore[arg-type] ) except KeyError: # ignore not founds; see discussion in GH#39424 @@ -3262,7 +3264,13 @@ def _update_indexer(idxr: Index, indexer: Index) -> Index: elif isinstance(k, slice): # a slice, include BOTH of the labels - lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer) + # Argument "indexer" to "_get_level_indexer" of "MultiIndex" has + # incompatible type "Index"; expected "Optional[Int64Index]" + lvl_indexer = self._get_level_indexer( + k, + level=i, + indexer=indexer, # type: ignore[arg-type] + ) indexer = _update_indexer( _convert_to_indexer(lvl_indexer), indexer=indexer,