diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f55d66ccaa5a9..ca24055e85ee4 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -291,7 +291,7 @@ Performance improvements - Performance improvement in :func:`read_stata` (:issue:`43059`) - Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`) - Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) -- +- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 8b09c3771d070..c4a695acc2768 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -2,6 +2,8 @@ import numpy as np from pandas._typing import npt +from pandas import MultiIndex + class IndexEngine: over_size_threshold: bool def __init__(self, vgetter, n: int): ... @@ -59,7 +61,7 @@ class BaseMultiIndexCodesEngine: self, target: npt.NDArray[np.object_], ) -> npt.NDArray[np.intp]: ... - def _extract_level_codes(self, target: object): ... + def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... 
def get_indexer_with_fill( self, target: np.ndarray, # np.ndarray[object] of tuples diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index b72be00714b6c..7aff683173855 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -622,7 +622,7 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zt)] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, target) -> np.ndarray: + def get_indexer(self, target: np.ndarray) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `self.values`, where -1 represents a value in `target` which does not @@ -630,15 +630,14 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- - target : MultiIndex + target : np.ndarray Returns ------- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ - lab_ints = self._extract_level_codes(target) - return self._base.get_indexer(self, lab_ints) + return self._base.get_indexer(self, target) def get_indexer_with_fill(self, ndarray target, ndarray values, str method, object limit) -> np.ndarray: @@ -741,10 +740,9 @@ cdef class BaseMultiIndexCodesEngine: return self._base.get_loc(self, lab_int) - def get_indexer_non_unique(self, target): + def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray: # target: MultiIndex - lab_ints = self._extract_level_codes(target) - indexer = self._base.get_indexer_non_unique(self, lab_ints) + indexer = self._base.get_indexer_non_unique(self, target) return indexer diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 12ac954f271d3..87e19ce6ef670 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3620,9 +3620,8 @@ def _get_indexer( else: tgt_values = target._get_engine_target() if target._is_multi and self._is_multi: - # error: Incompatible types in assignment (expression has type - # "Index", variable has type "ndarray[Any, Any]") - tgt_values = target # type: 
ignore[assignment] + tgt_values = self._engine._extract_level_codes(target) + indexer = self._engine.get_indexer(tgt_values) return ensure_platform_int(indexer) @@ -5465,7 +5464,7 @@ def get_indexer_non_unique( # self and non-Multi target tgt_values = target._get_engine_target() if self._is_multi and target._is_multi: - tgt_values = target + tgt_values = self._engine._extract_level_codes(target) indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing)