Skip to content

BUG: MultiIndex.difference incorrectly raising TypeError when indexes contain non-sortable entries #41915

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
8 commits merged on Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1035,6 +1035,7 @@ MultiIndex
- Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in the result (:issue:`38623`)
- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the two :class:`MultiIndex` objects contained ``NaN`` but were ordered differently (:issue:`38439`)
- Bug in :meth:`MultiIndex.intersection` always returning an empty result when intersecting with :class:`CategoricalIndex` (:issue:`38653`)
- Bug in :meth:`MultiIndex.difference` incorrectly raising ``TypeError`` when indexes contain non-sortable entries (:issue:`41915`)
- Bug in :meth:`MultiIndex.reindex` raising a ``ValueError`` when used on an empty :class:`MultiIndex` and indexing only a specific level (:issue:`41170`)
- Bug in :meth:`MultiIndex.reindex` raising ``TypeError`` when reindexing against a flat :class:`Index` (:issue:`41707`)

Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3091,7 +3091,7 @@ def intersection(self, other, sort=False):
return this.intersection(other, sort=sort)

result = self._intersection(other, sort=sort)
return self._wrap_setop_result(other, result)
return self._wrap_intersection_result(other, result)

def _intersection(self, other: Index, sort=False):
"""
Expand All @@ -3113,6 +3113,10 @@ def _intersection(self, other: Index, sort=False):
res_values = _maybe_try_sort(res_values, sort)
return res_values

def _wrap_intersection_result(self, other, result):
# We will override for MultiIndex to handle empty results
return self._wrap_setop_result(other, result)

def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike:
"""
Find the intersection of two Indexes using get_indexer.
Expand Down
35 changes: 4 additions & 31 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3588,50 +3588,23 @@ def _maybe_match_names(self, other):
names.append(None)
return names

def _intersection(self, other, sort=False) -> MultiIndex:
def _wrap_intersection_result(self, other, result):
other, result_names = self._convert_can_do_setop(other)
other = other.astype(object, copy=False)

uniq_tuples = None # flag whether _inner_indexer was successful
if self.is_monotonic and other.is_monotonic:
try:
inner_tuples = self._inner_indexer(other)[0]
sort = False # inner_tuples is already sorted
except TypeError:
pass
else:
uniq_tuples = algos.unique(inner_tuples)

if uniq_tuples is None:
uniq_tuples = self._intersection_via_get_indexer(other, sort)

if sort is None:
uniq_tuples = sorted(uniq_tuples)

if len(uniq_tuples) == 0:
if len(result) == 0:
return MultiIndex(
levels=self.levels,
codes=[[]] * self.nlevels,
names=result_names,
verify_integrity=False,
)
else:
return MultiIndex.from_arrays(
zip(*uniq_tuples), sortorder=0, names=result_names
)
return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names)

def _difference(self, other, sort) -> MultiIndex:
other, result_names = self._convert_can_do_setop(other)

this = self._get_unique_index()

indexer = this.get_indexer(other)
indexer = indexer.take((indexer != -1).nonzero()[0])

label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
difference = this._values.take(label_diff)
if sort is None:
difference = sorted(difference)
difference = super()._difference(other, sort)

if len(difference) == 0:
return MultiIndex(
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,10 @@ def test_difference_sort_incomparable():

other = MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]])
# sort=None, the default
# MultiIndex.difference deviates here from other difference
# implementations in not catching the TypeError
msg = "'<' not supported between instances of 'Timestamp' and 'int'"
with pytest.raises(TypeError, match=msg):
msg = "sort order is undefined for incomparable objects"
with tm.assert_produces_warning(RuntimeWarning, match=msg):
result = idx.difference(other)
tm.assert_index_equal(result, idx)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a behavior change? If so, does it need a release note?


# sort=False
result = idx.difference(other, sort=False)
Expand Down