From 0f6efa040a89b5c81260028fa621d7d09c7ea5bb Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Aug 2023 14:39:29 -0700 Subject: [PATCH 1/5] REF: fix can_use_libjoin check --- pandas/core/indexes/base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ee36a3515c4b3..8376613a4ba27 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4654,7 +4654,10 @@ def join( return self._join_non_unique(other, how=how) elif not self.is_unique or not other.is_unique: if self.is_monotonic_increasing and other.is_monotonic_increasing: - if not isinstance(self.dtype, IntervalDtype): + # Note: 2023-08-15 we *do* have tests that get here with + # Categorical, string[python] (can use libjoin) + # and Interval (cannot) + if self._can_use_libjoin: # otherwise we will fall through to _join_via_get_indexer # GH#39133 # go through object dtype for ea till engine is supported properly @@ -5048,8 +5051,8 @@ def _can_use_libjoin(self) -> bool: # values only, meaning no NA return ( isinstance(self.dtype, np.dtype) - or isinstance(self.values, BaseMaskedArray) - or isinstance(self._values, ArrowExtensionArray) + or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) + or self.dtype == "string[python]" ) return not isinstance(self.dtype, IntervalDtype) @@ -5172,6 +5175,7 @@ def _get_engine_target(self) -> ArrayLike: return self._values.astype(object) return vals + @final def _get_join_target(self) -> ArrayLike: """ Get the ndarray or ExtensionArray that we can pass to the join From c9a5d53474079983ca564e983197897eb237b2b8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Aug 2023 20:34:25 -0700 Subject: [PATCH 2/5] DOC: docstring for can_use_libjoin --- pandas/core/indexes/base.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8376613a4ba27..5ca1ddbf1a1f3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5041,10 +5041,16 @@ def _wrap_joined_index( name = get_op_result_name(self, other) return self._constructor._with_infer(joined, name=name, dtype=self.dtype) + @final @cache_readonly def _can_use_libjoin(self) -> bool: """ - Whether we can use the fastpaths implement in _libs.join + Whether we can use the fastpaths implemented in _libs.join. + + This is driven by whether (in monotonic increasing cases that are + guaranteed not to have NAs) we can convert to a np.ndarray without + making a copy. If we cannot, this negates the performance benefit + of using libjoin. """ if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic @@ -5054,6 +5060,9 @@ def _can_use_libjoin(self) -> bool: or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) or self.dtype == "string[python]" ) + # For IntervalIndex, the conversion to numpy converts + # to object dtype, which negates the performance benefit of libjoin + # TODO: exclude RangeIndex and MultiIndex as these also make copies? return not isinstance(self.dtype, IntervalDtype) # -------------------------------------------------------------------- @@ -5176,7 +5185,7 @@ def _get_engine_target(self) -> ArrayLike: return vals @final - def _get_join_target(self) -> ArrayLike: + def _get_join_target(self) -> np.ndarray: """ Get the ndarray or ExtensionArray that we can pass to the join functions. @@ -5188,7 +5197,13 @@ def _get_join_target(self) -> ArrayLike: # This is only used if our array is monotonic, so no missing values # present return self._values.to_numpy() - return self._get_engine_target() + + # TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create + # copies. + target = self._get_engine_target() + if not isinstance(target, np.ndarray): + raise ValueError("_can_use_libjoin should raise in this case.") + return target def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ From f65f29c7796a4db7a1b5cd4ded3e6d55e98ea493 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Aug 2023 11:34:35 -0700 Subject: [PATCH 3/5] Make can_use_libjoin checks more-correct --- pandas/core/indexes/base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5ca1ddbf1a1f3..8ff396c1ae8c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3354,6 +3354,7 @@ def _union(self, other: Index, sort: bool | None): and other.is_monotonic_increasing and not (self.has_duplicates and other.has_duplicates) and self._can_use_libjoin + and other._can_use_libjoin ): # Both are monotonic and at least one is unique, so can use outer join # (actually don't need either unique, but without this restriction @@ -3452,7 +3453,7 @@ def intersection(self, other, sort: bool = False): self, other = self._dti_setop_align_tzs(other, "intersection") if self.equals(other): - if self.has_duplicates: + if not self.is_unique: result = self.unique()._get_reconciled_name_object(other) else: result = self._get_reconciled_name_object(other) @@ -3507,7 +3508,9 @@ def _intersection(self, other: Index, sort: bool = False): self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin + and other._can_use_libjoin and not isinstance(self, ABCMultiIndex) + and not isinstance(other, ABCMultiIndex) ): try: res_indexer, indexer, _ = self._inner_indexer(other) @@ -4657,7 +4660,7 @@ def join( # Note: 2023-08-15 we *do* have tests that get here with # Categorical, string[python] (can use libjoin) # and Interval (cannot) - if self._can_use_libjoin: + if self._can_use_libjoin and other._can_use_libjoin: # otherwise we will fall through to _join_via_get_indexer # GH#39133 # go through object dtype for ea till engine is supported properly @@ -4669,6 +4672,7 @@ def join( self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin + and other._can_use_libjoin and not isinstance(self, ABCMultiIndex) and not isinstance(self.dtype, CategoricalDtype) ): @@ -4973,6 +4977,7 @@ def _join_monotonic( ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # We only get here with matching dtypes and both monotonic increasing assert other.dtype == self.dtype + assert self._can_use_libjoin and other._can_use_libjoin if self.equals(other): # This is a convenient place for this check, but its correctness From 2719f9bc93f7d33e36a4b69278775992a400aebd Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Aug 2023 07:15:24 -0700 Subject: [PATCH 4/5] mypy fixup --- pandas/core/indexes/base.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8ff396c1ae8c7..eae18962bc3ec 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -373,9 +373,6 @@ def _left_indexer_unique(self, other: Self) -> npt.NDArray[np.intp]: # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) # similar but not identical to ov.searchsorted(sv) return libjoin.left_join_indexer_unique(sv, ov) @@ -386,9 +383,6 @@ def _left_indexer( # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -400,9 +394,6 @@ def _inner_indexer( # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -414,9 +405,6 @@ def _outer_indexer( # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx From e6cadfeabcbb4114f03f0fccb25e61da61aec044 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Aug 2023 07:16:43 -0700 Subject: [PATCH 5/5] Fix exception message --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eae18962bc3ec..96ff25a6bc423 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5195,7 +5195,7 @@ def _get_join_target(self) -> np.ndarray: # copies. target = self._get_engine_target() if not isinstance(target, np.ndarray): - raise ValueError("_can_use_libjoin should raise in this case.") + raise ValueError("_can_use_libjoin should return False.") return target def _from_join_target(self, result: np.ndarray) -> ArrayLike: