Skip to content

Commit 191f859

Browse files
authored
BUG: loc.__getitem__[[na_value]] with CategoricalIndex containing NAs (#37722)
1 parent feb29c7 commit 191f859

File tree

6 files changed

+57
-12
lines changed

6 files changed

+57
-12
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,7 @@ Indexing
474474
- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`)
475475
- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raising when a numeric label was given for an object-dtype :class:`Index` even though the label was present in the :class:`Index` (:issue:`26491`)
476476
- Bug in :meth:`DataFrame.loc` returning the requested key plus missing values when ``loc`` was applied to a single level of a :class:`MultiIndex` (:issue:`27104`)
477+
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` incorrectly raising when using a list-like indexer containing NA values (:issue:`37722`)
477478

478479
Missing
479480
^^^^^^^

pandas/core/common.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,9 @@ def is_bool_indexer(key: Any) -> bool:
113113

114114
if not lib.is_bool_array(key):
115115
na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
116-
if isna(key).any():
116+
if lib.infer_dtype(key) == "boolean" and isna(key).any():
117+
# Don't raise on e.g. ["A", "B", np.nan], see
118+
# test_loc_getitem_list_of_labels_categoricalindex_with_na
117119
raise ValueError(na_msg)
118120
return False
119121
return True

pandas/core/indexes/category.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -586,11 +586,18 @@ def _convert_list_indexer(self, keyarr):
586586
indexer = self.categories._convert_list_indexer(keyarr)
587587
return Index(self.codes).get_indexer_for(indexer)
588588

589-
indexer = self.categories.get_indexer(np.asarray(keyarr))
590-
if (indexer == -1).any():
591-
raise KeyError(
592-
"a list-indexer must only include values that are in the categories"
593-
)
589+
msg = "a list-indexer must only include values that are in the categories"
590+
if self.hasnans:
591+
msg += " or NA"
592+
try:
593+
codes = self._data._validate_setitem_value(keyarr)
594+
except (ValueError, TypeError) as err:
595+
if "Index data must be 1-dimensional" in str(err):
596+
# e.g. test_setitem_ndarray_3d
597+
raise
598+
raise KeyError(msg)
599+
if not self.hasnans and (codes == -1).any():
600+
raise KeyError(msg)
594601

595602
return self.get_indexer(keyarr)
596603

pandas/tests/indexing/test_categorical.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -285,12 +285,8 @@ def test_loc_listlike(self):
285285
tm.assert_frame_equal(result, expected, check_index_type=True)
286286

287287
# not all labels in the categories
288-
with pytest.raises(
289-
KeyError,
290-
match=(
291-
"'a list-indexer must only include values that are in the categories'"
292-
),
293-
):
288+
msg = "a list-indexer must only include values that are in the categories"
289+
with pytest.raises(KeyError, match=msg):
294290
self.df2.loc[["a", "d"]]
295291

296292
def test_loc_listlike_dtypes(self):

pandas/tests/indexing/test_loc.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import pandas as pd
1414
from pandas import (
15+
CategoricalIndex,
1516
DataFrame,
1617
Index,
1718
MultiIndex,
@@ -1645,6 +1646,36 @@ def test_loc_setitem_mask_td64_series_value(self):
16451646
tm.assert_frame_equal(df, df_copy)
16461647

16471648

1649+
class TestLocListlike:
    """List-like label lookups via ``loc`` / ``[]`` on a CategoricalIndex."""

    @pytest.mark.parametrize("box", [lambda x: x, np.asarray, list])
    def test_loc_getitem_list_of_labels_categoricalindex_with_na(self, box):
        """
        A list-like key may hold valid categories or NA when the index has NAs.

        ``box`` controls the container the key is passed in: the
        CategoricalIndex itself, an ndarray, or a plain list.
        """
        # passing a list can include valid categories _or_ NA values
        ci = CategoricalIndex(["A", "B", np.nan])
        ser = Series(range(3), index=ci)
        frame = ser.to_frame()

        # Selecting with every label (NA included) round-trips the object.
        full_key = box(ci)
        tm.assert_series_equal(ser.loc[full_key], ser)
        tm.assert_series_equal(ser[full_key], ser)
        tm.assert_frame_equal(frame.loc[full_key], frame)

        # Drop the NA row from the data, but keep NA in the key.
        ser2 = ser[:-1]
        ci2 = ci[1:]
        key_with_na = box(ci2)

        # but if there are no NAs present, this should raise KeyError
        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            ser2.loc[key_with_na]

        with pytest.raises(KeyError, match=msg):
            ser2[key_with_na]

        with pytest.raises(KeyError, match=msg):
            ser2.to_frame().loc[key_with_na]
1677+
1678+
16481679
def test_series_loc_getitem_label_list_missing_values():
16491680
# gh-11428
16501681
key = np.array(

pandas/tests/test_common.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,11 @@ def test_serializable(obj):
158158
# GH 35611
159159
unpickled = tm.round_trip_pickle(obj)
160160
assert type(obj) == type(unpickled)
161+
162+
163+
class TestIsBoolIndexer:
    """Tests for ``com.is_bool_indexer``."""

    def test_non_bool_array_with_na(self):
        """A non-boolean array holding NaN is not a bool indexer and must not raise."""
        # in particular, this should not raise
        # dtype=object is essential: without it NumPy coerces np.nan to the
        # string "nan" (dtype "<U3"), so the array would contain no NA value
        # and this test would pass even without the fix it guards.
        arr = np.array(["A", "B", np.nan], dtype=object)

        assert not com.is_bool_indexer(arr)

0 commit comments

Comments
 (0)