@@ -373,9 +373,6 @@ def _left_indexer_unique(self, other: Self) -> npt.NDArray[np.intp]:
373
373
# Caller is responsible for ensuring other.dtype == self.dtype
374
374
sv = self ._get_join_target ()
375
375
ov = other ._get_join_target ()
376
- # can_use_libjoin assures sv and ov are ndarrays
377
- sv = cast (np .ndarray , sv )
378
- ov = cast (np .ndarray , ov )
379
376
# similar but not identical to ov.searchsorted(sv)
380
377
return libjoin .left_join_indexer_unique (sv , ov )
381
378
@@ -386,9 +383,6 @@ def _left_indexer(
386
383
# Caller is responsible for ensuring other.dtype == self.dtype
387
384
sv = self ._get_join_target ()
388
385
ov = other ._get_join_target ()
389
- # can_use_libjoin assures sv and ov are ndarrays
390
- sv = cast (np .ndarray , sv )
391
- ov = cast (np .ndarray , ov )
392
386
joined_ndarray , lidx , ridx = libjoin .left_join_indexer (sv , ov )
393
387
joined = self ._from_join_target (joined_ndarray )
394
388
return joined , lidx , ridx
@@ -400,9 +394,6 @@ def _inner_indexer(
400
394
# Caller is responsible for ensuring other.dtype == self.dtype
401
395
sv = self ._get_join_target ()
402
396
ov = other ._get_join_target ()
403
- # can_use_libjoin assures sv and ov are ndarrays
404
- sv = cast (np .ndarray , sv )
405
- ov = cast (np .ndarray , ov )
406
397
joined_ndarray , lidx , ridx = libjoin .inner_join_indexer (sv , ov )
407
398
joined = self ._from_join_target (joined_ndarray )
408
399
return joined , lidx , ridx
@@ -414,9 +405,6 @@ def _outer_indexer(
414
405
# Caller is responsible for ensuring other.dtype == self.dtype
415
406
sv = self ._get_join_target ()
416
407
ov = other ._get_join_target ()
417
- # can_use_libjoin assures sv and ov are ndarrays
418
- sv = cast (np .ndarray , sv )
419
- ov = cast (np .ndarray , ov )
420
408
joined_ndarray , lidx , ridx = libjoin .outer_join_indexer (sv , ov )
421
409
joined = self ._from_join_target (joined_ndarray )
422
410
return joined , lidx , ridx
@@ -3354,6 +3342,7 @@ def _union(self, other: Index, sort: bool | None):
3354
3342
and other .is_monotonic_increasing
3355
3343
and not (self .has_duplicates and other .has_duplicates )
3356
3344
and self ._can_use_libjoin
3345
+ and other ._can_use_libjoin
3357
3346
):
3358
3347
# Both are monotonic and at least one is unique, so can use outer join
3359
3348
# (actually don't need either unique, but without this restriction
@@ -3452,7 +3441,7 @@ def intersection(self, other, sort: bool = False):
3452
3441
self , other = self ._dti_setop_align_tzs (other , "intersection" )
3453
3442
3454
3443
if self .equals (other ):
3455
- if self .has_duplicates :
3444
+ if not self .is_unique :
3456
3445
result = self .unique ()._get_reconciled_name_object (other )
3457
3446
else :
3458
3447
result = self ._get_reconciled_name_object (other )
@@ -3507,7 +3496,9 @@ def _intersection(self, other: Index, sort: bool = False):
3507
3496
self .is_monotonic_increasing
3508
3497
and other .is_monotonic_increasing
3509
3498
and self ._can_use_libjoin
3499
+ and other ._can_use_libjoin
3510
3500
and not isinstance (self , ABCMultiIndex )
3501
+ and not isinstance (other , ABCMultiIndex )
3511
3502
):
3512
3503
try :
3513
3504
res_indexer , indexer , _ = self ._inner_indexer (other )
@@ -4654,7 +4645,10 @@ def join(
4654
4645
return self ._join_non_unique (other , how = how )
4655
4646
elif not self .is_unique or not other .is_unique :
4656
4647
if self .is_monotonic_increasing and other .is_monotonic_increasing :
4657
- if not isinstance (self .dtype , IntervalDtype ):
4648
+ # Note: 2023-08-15 we *do* have tests that get here with
4649
+ # Categorical, string[python] (can use libjoin)
4650
+ # and Interval (cannot)
4651
+ if self ._can_use_libjoin and other ._can_use_libjoin :
4658
4652
# otherwise we will fall through to _join_via_get_indexer
4659
4653
# GH#39133
4660
4654
# go through object dtype for ea till engine is supported properly
@@ -4666,6 +4660,7 @@ def join(
4666
4660
self .is_monotonic_increasing
4667
4661
and other .is_monotonic_increasing
4668
4662
and self ._can_use_libjoin
4663
+ and other ._can_use_libjoin
4669
4664
and not isinstance (self , ABCMultiIndex )
4670
4665
and not isinstance (self .dtype , CategoricalDtype )
4671
4666
):
@@ -4970,6 +4965,7 @@ def _join_monotonic(
4970
4965
) -> tuple [Index , npt .NDArray [np .intp ] | None , npt .NDArray [np .intp ] | None ]:
4971
4966
# We only get here with matching dtypes and both monotonic increasing
4972
4967
assert other .dtype == self .dtype
4968
+ assert self ._can_use_libjoin and other ._can_use_libjoin
4973
4969
4974
4970
if self .equals (other ):
4975
4971
# This is a convenient place for this check, but its correctness
@@ -5038,19 +5034,28 @@ def _wrap_joined_index(
5038
5034
name = get_op_result_name (self , other )
5039
5035
return self ._constructor ._with_infer (joined , name = name , dtype = self .dtype )
5040
5036
5037
+ @final
5041
5038
@cache_readonly
5042
5039
def _can_use_libjoin (self ) -> bool :
5043
5040
"""
5044
- Whether we can use the fastpaths implement in _libs.join
5041
+ Whether we can use the fastpaths implemented in _libs.join.
5042
+
5043
+ This is driven by whether (in monotonic increasing cases that are
5044
+ guaranteed not to have NAs) we can convert to a np.ndarray without
5045
+ making a copy. If we cannot, this negates the performance benefit
5046
+ of using libjoin.
5045
5047
"""
5046
5048
if type (self ) is Index :
5047
5049
# excludes EAs, but includes masks; we get here with monotonic
5048
5050
# values only, meaning no NA
5049
5051
return (
5050
5052
isinstance (self .dtype , np .dtype )
5051
- or isinstance (self .values , BaseMaskedArray )
5052
- or isinstance ( self ._values , ArrowExtensionArray )
5053
+ or isinstance (self ._values , ( ArrowExtensionArray , BaseMaskedArray ) )
5054
+ or self .dtype == "string[python]"
5053
5055
)
5056
+ # For IntervalIndex, the conversion to numpy converts
5057
+ # to object dtype, which negates the performance benefit of libjoin
5058
+ # TODO: exclude RangeIndex and MultiIndex as these also make copies?
5054
5059
return not isinstance (self .dtype , IntervalDtype )
5055
5060
5056
5061
# --------------------------------------------------------------------
@@ -5172,7 +5177,8 @@ def _get_engine_target(self) -> ArrayLike:
5172
5177
return self ._values .astype (object )
5173
5178
return vals
5174
5179
5175
- def _get_join_target (self ) -> ArrayLike :
5180
+ @final
5181
+ def _get_join_target (self ) -> np .ndarray :
5176
5182
"""
5177
5183
Get the ndarray that we can pass to the join
5178
5184
functions.
@@ -5184,7 +5190,13 @@ def _get_join_target(self) -> ArrayLike:
5184
5190
# This is only used if our array is monotonic, so no missing values
5185
5191
# present
5186
5192
return self ._values .to_numpy ()
5187
- return self ._get_engine_target ()
5193
+
5194
+ # TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create
5195
+ # copies.
5196
+ target = self ._get_engine_target ()
5197
+ if not isinstance (target , np .ndarray ):
5198
+ raise ValueError ("_can_use_libjoin should return False." )
5199
+ return target
5188
5200
5189
5201
def _from_join_target (self , result : np .ndarray ) -> ArrayLike :
5190
5202
"""
0 commit comments