Skip to content

REF: use can_use_libjoin more consistently, docstring for it #54664

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 21, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 31 additions & 19 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,9 +373,6 @@ def _left_indexer_unique(self, other: Self) -> npt.NDArray[np.intp]:
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
# can_use_libjoin assures sv and ov are ndarrays
sv = cast(np.ndarray, sv)
ov = cast(np.ndarray, ov)
# similar but not identical to ov.searchsorted(sv)
return libjoin.left_join_indexer_unique(sv, ov)

Expand All @@ -386,9 +383,6 @@ def _left_indexer(
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
# can_use_libjoin assures sv and ov are ndarrays
sv = cast(np.ndarray, sv)
ov = cast(np.ndarray, ov)
joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov)
joined = self._from_join_target(joined_ndarray)
return joined, lidx, ridx
Expand All @@ -400,9 +394,6 @@ def _inner_indexer(
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
# can_use_libjoin assures sv and ov are ndarrays
sv = cast(np.ndarray, sv)
ov = cast(np.ndarray, ov)
joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov)
joined = self._from_join_target(joined_ndarray)
return joined, lidx, ridx
Expand All @@ -414,9 +405,6 @@ def _outer_indexer(
# Caller is responsible for ensuring other.dtype == self.dtype
sv = self._get_join_target()
ov = other._get_join_target()
# can_use_libjoin assures sv and ov are ndarrays
sv = cast(np.ndarray, sv)
ov = cast(np.ndarray, ov)
joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov)
joined = self._from_join_target(joined_ndarray)
return joined, lidx, ridx
Expand Down Expand Up @@ -3354,6 +3342,7 @@ def _union(self, other: Index, sort: bool | None):
and other.is_monotonic_increasing
and not (self.has_duplicates and other.has_duplicates)
and self._can_use_libjoin
and other._can_use_libjoin
):
# Both are monotonic and at least one is unique, so can use outer join
# (actually don't need either unique, but without this restriction
Expand Down Expand Up @@ -3452,7 +3441,7 @@ def intersection(self, other, sort: bool = False):
self, other = self._dti_setop_align_tzs(other, "intersection")

if self.equals(other):
if self.has_duplicates:
if not self.is_unique:
result = self.unique()._get_reconciled_name_object(other)
else:
result = self._get_reconciled_name_object(other)
Expand Down Expand Up @@ -3507,7 +3496,9 @@ def _intersection(self, other: Index, sort: bool = False):
self.is_monotonic_increasing
and other.is_monotonic_increasing
and self._can_use_libjoin
and other._can_use_libjoin
and not isinstance(self, ABCMultiIndex)
and not isinstance(other, ABCMultiIndex)
):
try:
res_indexer, indexer, _ = self._inner_indexer(other)
Expand Down Expand Up @@ -4654,7 +4645,10 @@ def join(
return self._join_non_unique(other, how=how)
elif not self.is_unique or not other.is_unique:
if self.is_monotonic_increasing and other.is_monotonic_increasing:
if not isinstance(self.dtype, IntervalDtype):
# Note: 2023-08-15 we *do* have tests that get here with
# Categorical, string[python] (can use libjoin)
# and Interval (cannot)
if self._can_use_libjoin and other._can_use_libjoin:
# otherwise we will fall through to _join_via_get_indexer
# GH#39133
# go through object dtype for ea till engine is supported properly
Expand All @@ -4666,6 +4660,7 @@ def join(
self.is_monotonic_increasing
and other.is_monotonic_increasing
and self._can_use_libjoin
and other._can_use_libjoin
and not isinstance(self, ABCMultiIndex)
and not isinstance(self.dtype, CategoricalDtype)
):
Expand Down Expand Up @@ -4970,6 +4965,7 @@ def _join_monotonic(
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
# We only get here with matching dtypes and both monotonic increasing
assert other.dtype == self.dtype
assert self._can_use_libjoin and other._can_use_libjoin

if self.equals(other):
# This is a convenient place for this check, but its correctness
Expand Down Expand Up @@ -5038,19 +5034,28 @@ def _wrap_joined_index(
name = get_op_result_name(self, other)
return self._constructor._with_infer(joined, name=name, dtype=self.dtype)

@final
@cache_readonly
def _can_use_libjoin(self) -> bool:
"""
Whether we can use the fastpaths implement in _libs.join
Whether we can use the fastpaths implemented in _libs.join.

This is driven by whether (in monotonic increasing cases that are
guaranteed not to have NAs) we can convert to a np.ndarray without
making a copy. If we cannot, this negates the performance benefit
of using libjoin.
"""
if type(self) is Index:
# excludes EAs, but include masks, we get here with monotonic
# values only, meaning no NA
return (
isinstance(self.dtype, np.dtype)
or isinstance(self.values, BaseMaskedArray)
or isinstance(self._values, ArrowExtensionArray)
or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray))
or self.dtype == "string[python]"
)
# For IntervalIndex, the conversion to numpy converts
# to object dtype, which negates the performance benefit of libjoin
# TODO: exclude RangeIndex and MultiIndex as these also make copies?
return not isinstance(self.dtype, IntervalDtype)

# --------------------------------------------------------------------
Expand Down Expand Up @@ -5172,7 +5177,8 @@ def _get_engine_target(self) -> ArrayLike:
return self._values.astype(object)
return vals

def _get_join_target(self) -> ArrayLike:
@final
def _get_join_target(self) -> np.ndarray:
"""
Get the ndarray or ExtensionArray that we can pass to the join
functions.
Expand All @@ -5184,7 +5190,13 @@ def _get_join_target(self) -> ArrayLike:
# This is only used if our array is monotonic, so no missing values
# present
return self._values.to_numpy()
return self._get_engine_target()

# TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create
# copies.
target = self._get_engine_target()
if not isinstance(target, np.ndarray):
raise ValueError("_can_use_libjoin should return False.")
return target

def _from_join_target(self, result: np.ndarray) -> ArrayLike:
"""
Expand Down