diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e488ca52be8a0..77d55de5d3170 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -469,6 +469,7 @@ Indexing - Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when numeric label was given for object :class:`Index` although label was in :class:`Index` (:issue:`26491`) - Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a list-like indexer containing NA values incorrectly raising (:issue:`37722`) Missing ^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index 9b6133d2f7627..d5c078b817ca0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -113,7 +113,9 @@ def is_bool_indexer(key: Any) -> bool: if not lib.is_bool_array(key): na_msg = "Cannot mask with non-boolean array containing NA / NaN values" - if isna(key).any(): + if lib.infer_dtype(key) == "boolean" and isna(key).any(): + # Don't raise on e.g. 
["A", "B", np.nan], see + # test_loc_getitem_list_of_labels_categoricalindex_with_na raise ValueError(na_msg) return False return True diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 24bd60a7356dd..06df8f85cded7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -586,11 +586,18 @@ def _convert_list_indexer(self, keyarr): indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - indexer = self.categories.get_indexer(np.asarray(keyarr)) - if (indexer == -1).any(): - raise KeyError( - "a list-indexer must only include values that are in the categories" - ) + msg = "a list-indexer must only include values that are in the categories" + if self.hasnans: + msg += " or NA" + try: + codes = self._data._validate_setitem_value(keyarr) + except (ValueError, TypeError) as err: + if "Index data must be 1-dimensional" in str(err): + # e.g. test_setitem_ndarray_3d + raise + raise KeyError(msg) + if not self.hasnans and (codes == -1).any(): + raise KeyError(msg) return self.get_indexer(keyarr) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 20d7662855ab3..fdbdbb5f0c710 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -285,12 +285,8 @@ def test_loc_listlike(self): tm.assert_frame_equal(result, expected, check_index_type=True) # not all labels in the categories - with pytest.raises( - KeyError, - match=( - "'a list-indexer must only include values that are in the categories'" - ), - ): + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): self.df2.loc[["a", "d"]] def test_loc_listlike_dtypes(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 0d40b5f38e48a..9aab867df4b17 100644 --- a/pandas/tests/indexing/test_loc.py +++ 
b/pandas/tests/indexing/test_loc.py @@ -12,6 +12,7 @@ import pandas as pd from pandas import ( + CategoricalIndex, DataFrame, Index, MultiIndex, @@ -1645,6 +1646,36 @@ def test_loc_setitem_mask_td64_series_value(self): tm.assert_frame_equal(df, df_copy) +class TestLocListlike: + @pytest.mark.parametrize("box", [lambda x: x, np.asarray, list]) + def test_loc_getitem_list_of_labels_categoricalindex_with_na(self, box): + # passing a list can include valid categories _or_ NA values + ci = CategoricalIndex(["A", "B", np.nan]) + ser = Series(range(3), index=ci) + + result = ser.loc[box(ci)] + tm.assert_series_equal(result, ser) + + result = ser[box(ci)] + tm.assert_series_equal(result, ser) + + result = ser.to_frame().loc[box(ci)] + tm.assert_frame_equal(result, ser.to_frame()) + + ser2 = ser[:-1] + ci2 = ci[1:] + # but if there are no NAs present, this should raise KeyError + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): + ser2.loc[box(ci2)] + + with pytest.raises(KeyError, match=msg): + ser2[box(ci2)] + + with pytest.raises(KeyError, match=msg): + ser2.to_frame().loc[box(ci2)] + + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 key = np.array( diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 81d866ba63bc0..8e1186b790e3d 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -158,3 +158,11 @@ def test_serializable(obj): # GH 35611 unpickled = tm.round_trip_pickle(obj) assert type(obj) == type(unpickled) + + +class TestIsBoolIndexer: + def test_non_bool_array_with_na(self): + # in particular, this should not raise + arr = np.array(["A", "B", np.nan]) + + assert not com.is_bool_indexer(arr)