Skip to content

Commit 0721841

Browse files
proost authored and jreback committed
ENH: pd.MultiIndex.get_loc(np.nan) (#28919) (#28919)
1 parent 89bc0aa commit 0721841

File tree

4 files changed

+147
-10
lines changed

4 files changed

+147
-10
lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,7 @@ Indexing
938938
- Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`)
939939
- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
940940
- Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`)
941+
- Bug in :meth:`MultiIndex.get_loc` where missing values could not be located when the key includes missing values (:issue:`19132`)
941942
- Bug in :meth:`Series.__setitem__` incorrectly assigning values with boolean indexer when the length of new data matches the number of ``True`` values and new data is not a ``Series`` or an ``np.array`` (:issue:`30567`)
942943
- Bug in indexing with a :class:`PeriodIndex` incorrectly accepting integers representing years, use e.g. ``ser.loc["2007"]`` instead of ``ser.loc[2007]`` (:issue:`30763`)
943944

pandas/core/indexes/multi.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2537,7 +2537,7 @@ def _partial_tup_index(self, tup, side="left"):
25372537
for k, (lab, lev, labs) in enumerate(zipped):
25382538
section = labs[start:end]
25392539

2540-
if lab not in lev:
2540+
if lab not in lev and not isna(lab):
25412541
if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)):
25422542
raise TypeError(f"Level type mismatch: {lab}")
25432543

@@ -2547,13 +2547,38 @@ def _partial_tup_index(self, tup, side="left"):
25472547
loc -= 1
25482548
return start + section.searchsorted(loc, side=side)
25492549

2550-
idx = lev.get_loc(lab)
2550+
idx = self._get_loc_single_level_index(lev, lab)
25512551
if k < n - 1:
25522552
end = start + section.searchsorted(idx, side="right")
25532553
start = start + section.searchsorted(idx, side="left")
25542554
else:
25552555
return start + section.searchsorted(idx, side=side)
25562556

2557+
def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
    """
    Return the location of ``key`` within a single level of the index.

    A scalar missing-value key is not looked up in ``level_index``; it is
    always reported as -1.

    Parameters
    ----------
    level_index : Index
        The single level to search.
    key : label
        The label to locate.

    Returns
    -------
    int
        -1 if ``key`` is a scalar NA value, otherwise the result of
        ``level_index.get_loc(key)``.

    See Also
    --------
    Index.get_loc : The get_loc method for (single-level) index.
    """
    # ``isna`` is only consulted for scalars; on an array-like it returns an
    # array rather than a single boolean that could be branched on.
    if is_scalar(key) and isna(key):
        return -1
    return level_index.get_loc(key)
2581+
25572582
def get_loc(self, key, method=None):
25582583
"""
25592584
Get location for a label or a tuple of labels as an integer, slice or
@@ -2652,7 +2677,9 @@ def _maybe_to_slice(loc):
26522677
loc = np.arange(start, stop, dtype="int64")
26532678

26542679
for i, k in enumerate(follow_key, len(lead_key)):
2655-
mask = self.codes[i][loc] == self.levels[i].get_loc(k)
2680+
mask = self.codes[i][loc] == self._get_loc_single_level_index(
2681+
self.levels[i], k
2682+
)
26562683
if not mask.all():
26572684
loc = loc[mask]
26582685
if not len(loc):
@@ -2880,7 +2907,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
28802907

28812908
else:
28822909

2883-
code = level_index.get_loc(key)
2910+
code = self._get_loc_single_level_index(level_index, key)
28842911

28852912
if level > 0 or self.lexsort_depth == 0:
28862913
# Desired level is not sorted
@@ -3375,14 +3402,11 @@ def isin(self, values, level=None):
33753402
return algos.isin(self.values, values)
33763403
else:
33773404
num = self._get_level_number(level)
3378-
levs = self.levels[num]
3379-
level_codes = self.codes[num]
3405+
levs = self.get_level_values(num)
33803406

3381-
sought_labels = levs.isin(values).nonzero()[0]
33823407
if levs.size == 0:
3383-
return np.zeros(len(level_codes), dtype=np.bool_)
3384-
else:
3385-
return np.lib.arraysetops.in1d(level_codes, sought_labels)
3408+
return np.zeros(len(levs), dtype=np.bool_)
3409+
return levs.isin(values)
33863410

33873411

33883412
MultiIndex._add_numeric_methods_disabled()

pandas/tests/indexes/multi/test_contains.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,27 @@ def test_isin_level_kwarg():
9898

9999
with pytest.raises(KeyError, match="'Level C not found'"):
100100
idx.isin(vals_1, level="C")
101+
102+
103+
def test_contains_with_missing_value():
    """Check NaN membership in a MultiIndex, alone and inside a tuple key."""
    # issue 19132
    single_level = MultiIndex.from_arrays([[1, np.nan, 2]])
    assert np.nan in single_level

    two_level = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
    assert np.nan not in two_level
    assert (1, np.nan) in two_level
111+
112+
113+
@pytest.mark.parametrize(
    "labels,expected,level",
    [
        ([("b", np.nan)], np.array([False, False, True]), None),
        ([np.nan, "a"], np.array([True, True, False]), 0),
        (["d", np.nan], np.array([False, True, True]), 1),
    ],
)
def test_isin_multi_index_with_missing_value(labels, expected, level):
    """Check ``isin`` on a MultiIndex containing NaN, with and without ``level``."""
    # GH 19132
    first_level = [np.nan, "a", "b"]
    second_level = ["c", "d", np.nan]
    midx = MultiIndex.from_arrays([first_level, second_level])
    result = midx.isin(labels, level=level)
    tm.assert_numpy_array_equal(result, expected)

pandas/tests/indexes/multi/test_indexing.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,91 @@ def test_timestamp_multiindex_indexer():
437437
)
438438
should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo")
439439
tm.assert_series_equal(result, should_be)
440+
441+
442+
def test_get_loc_with_values_including_missing_values():
    """Check ``get_loc`` with NaN keys, alone or as part of a tuple."""
    # issue 19132
    two_level_product = MultiIndex.from_product([[np.nan, 1]] * 2)
    assert two_level_product.get_loc(np.nan) == slice(0, 2, None)

    repeated_nan = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]])
    mask = np.array([True, False, False, True])
    tm.assert_numpy_array_equal(repeated_nan.get_loc(np.nan), mask)

    three_level_product = MultiIndex.from_product([[np.nan, 1]] * 3)
    assert three_level_product.get_loc((np.nan, 1)) == slice(2, 4, None)
455+
456+
457+
@pytest.mark.parametrize(
    "index_arr,labels,expected",
    [
        (
            [[1, np.nan, 2], [3, 4, 5]],
            [1, np.nan, 2],
            np.array([-1, -1, -1], dtype=np.intp),
        ),
        (
            [[1, np.nan, 2], [3, 4, 5]],
            [(np.nan, 4)],
            np.array([1], dtype=np.intp),
        ),
        (
            [[1, 2, 3], [np.nan, 4, 5]],
            [(1, np.nan)],
            np.array([0], dtype=np.intp),
        ),
        (
            [[1, 2, 3], [np.nan, 4, 5]],
            [np.nan, 4, 5],
            np.array([-1, -1, -1], dtype=np.intp),
        ),
    ],
)
def test_get_indexer_with_missing_value(index_arr, labels, expected):
    """Check ``get_indexer`` output for targets that contain NaN."""
    # issue 19132
    midx = MultiIndex.from_arrays(index_arr)
    tm.assert_numpy_array_equal(midx.get_indexer(labels), expected)
479+
480+
481+
@pytest.mark.parametrize(
    "index_arr,expected,target,algo",
    [
        ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"),
        ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"),
    ],
)
def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
    """Check ``get_slice_bound`` for a NaN target on the given side (``algo``)."""
    # issue 19132
    midx = MultiIndex.from_arrays(index_arr)
    bound = midx.get_slice_bound(target, side=algo, kind="loc")
    assert bound == expected
494+
495+
496+
@pytest.mark.parametrize(
    "index_arr,expected,start_idx,end_idx",
    [
        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1),
        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)),
        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3),
        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)),
    ],
)
def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx):
    """Check ``slice_indexer`` when the start bound contains NaN."""
    # issue 19132
    midx = MultiIndex.from_arrays(index_arr)
    indexer = midx.slice_indexer(start=start_idx, end=end_idx)
    assert indexer == expected
510+
511+
512+
@pytest.mark.parametrize(
    "index_arr,expected,start_idx,end_idx",
    [
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")),
    ],
)
def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx):
    """Check ``slice_locs`` when the start bound contains NaN."""
    # issue 19132
    midx = MultiIndex.from_arrays(index_arr)
    locs = midx.slice_locs(start=start_idx, end=end_idx)
    assert locs == expected

0 commit comments

Comments
 (0)