From 6f935646f13c42e3632feb7b77a75bc8f342cb0a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 9 Nov 2020 12:35:11 -0800 Subject: [PATCH 1/3] BUG: loc.__getitem__[[na_value]] with CategoricalIndex containing NAs --- pandas/core/common.py | 4 ++- pandas/core/indexes/category.py | 17 +++++++++---- pandas/tests/indexing/test_categorical.py | 8 ++---- pandas/tests/indexing/test_loc.py | 30 +++++++++++++++++++++++ pandas/tests/test_common.py | 8 ++++++ 5 files changed, 55 insertions(+), 12 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 9b6133d2f7627..d5c078b817ca0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -113,7 +113,9 @@ def is_bool_indexer(key: Any) -> bool: if not lib.is_bool_array(key): na_msg = "Cannot mask with non-boolean array containing NA / NaN values" - if isna(key).any(): + if lib.infer_dtype(key) == "boolean" and isna(key).any(): + # Don't raise on e.g. ["A", "B", np.nan], see + # test_loc_getitem_list_of_labels_categoricalindex_with_na raise ValueError(na_msg) return False return True diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 1be979b1b899c..a2c1243eaf1ea 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -561,11 +561,18 @@ def _convert_list_indexer(self, keyarr): indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - indexer = self.categories.get_indexer(np.asarray(keyarr)) - if (indexer == -1).any(): - raise KeyError( - "a list-indexer must only include values that are in the categories" - ) + msg = "a list-indexer must only include values that are in the categories" + if self.hasnans: + msg += " or NA" + try: + codes = self._data._validate_setitem_value(keyarr) + except (ValueError, TypeError) as err: + if "Index data must be 1-dimensional" in str(err): + # e.g. test_setitem_ndarray_3d + raise + raise KeyError(msg) + if not self.hasnans and (codes == -1).any(): + raise KeyError(msg) return self.get_indexer(keyarr) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9885765bf53e4..ec8ca6b225fdb 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -306,12 +306,8 @@ def test_loc_listlike(self): tm.assert_frame_equal(result, expected, check_index_type=True) # not all labels in the categories - with pytest.raises( - KeyError, - match=( - "'a list-indexer must only include values that are in the categories'" - ), - ): + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): self.df2.loc[["a", "d"]] def test_loc_listlike_dtypes(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 26c9e127bcc10..7c8ea14540e69 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -12,6 +12,7 @@ import pandas as pd from pandas import ( + CategoricalIndex, DataFrame, Index, MultiIndex, @@ -1559,6 +1560,35 @@ def test_loc_setitem_mask_and_label_with_datetimeindex(self): tm.assert_frame_equal(df, expected) +class TestLocListlike: + def test_loc_getitem_list_of_labels_categoricalindex_with_na(self): + # passing a list can include valid categories _or_ NA values + ci = CategoricalIndex(["A", "B", np.nan]) + ser = Series(range(3), index=ci) + + result = ser.loc[ci] + tm.assert_series_equal(result, ser) + + result = ser.loc[np.asarray(ci)] + tm.assert_series_equal(result, ser) + + result = ser.loc[list(ci)] + tm.assert_series_equal(result, ser) + + ser2 = ser[:-1] + ci2 = ci[1:] + # but if there are no NAs present, this should raise KeyError + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): + ser2.loc[ci2] + + with pytest.raises(KeyError, match=msg): + ser2.loc[np.asarray(ci2)] + + with pytest.raises(KeyError, match=msg): + ser2.loc[list(ci2)] + + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 key = np.array( diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 81d866ba63bc0..8e1186b790e3d 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -158,3 +158,11 @@ def test_serializable(obj): # GH 35611 unpickled = tm.round_trip_pickle(obj) assert type(obj) == type(unpickled) + + +class TestIsBoolIndexer: + def test_non_bool_array_with_na(self): + # in particular, this should not raise + arr = np.array(["A", "B", np.nan]) + + assert not com.is_bool_indexer(arr) From 3181ed69a7db729cff4bc9e91075ed666d6a7e42 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 9 Nov 2020 12:39:24 -0800 Subject: [PATCH 2/3] parametrize --- pandas/tests/indexing/test_loc.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7c8ea14540e69..954c782caf22f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1561,32 +1561,33 @@ def test_loc_setitem_mask_and_label_with_datetimeindex(self): class TestLocListlike: - def test_loc_getitem_list_of_labels_categoricalindex_with_na(self): + @pytest.mark.parametrize("box", [lambda x: x, np.asarray, list]) + def test_loc_getitem_list_of_labels_categoricalindex_with_na(self, box): # passing a list can include valid categories _or_ NA values ci = CategoricalIndex(["A", "B", np.nan]) ser = Series(range(3), index=ci) - result = ser.loc[ci] + result = ser.loc[box(ci)] tm.assert_series_equal(result, ser) - result = ser.loc[np.asarray(ci)] + result = ser[box(ci)] tm.assert_series_equal(result, ser) - result = ser.loc[list(ci)] - tm.assert_series_equal(result, ser) + result = ser.to_frame().loc[box(ci)] + tm.assert_frame_equal(result, ser.to_frame()) ser2 = ser[:-1] ci2 = ci[1:] # but if there are no NAs present, this should raise KeyError msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): - ser2.loc[ci2] + ser2.loc[box(ci2)] with pytest.raises(KeyError, match=msg): - ser2.loc[np.asarray(ci2)] + ser2[box(ci2)] with pytest.raises(KeyError, match=msg): - ser2.loc[list(ci2)] + ser2.to_frame().loc[box(ci2)] def test_series_loc_getitem_label_list_missing_values(): From 7c5f54d506d996b48433de4c37bee92adbcc1b25 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 9 Nov 2020 19:39:48 -0800 Subject: [PATCH 3/3] whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e488ca52be8a0..77d55de5d3170 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -469,6 +469,7 @@ Indexing - Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when numeric label was given for object :class:`Index` although label was in :class:`Index` (:issue:`26491`) - Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from :class:`MultiIndex` (:issue:`27104`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) Missing ^^^^^^^