diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3249c4bb9e0ef..0a888473a1069 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -587,6 +587,7 @@ Indexing - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) - Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`) - Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty :class:`DataFrame` with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e67ece36b55a5..62c6a6fa7b513 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -576,23 +576,11 @@ def _convert_list_indexer(self, keyarr): # the categories if self.categories._defer_to_indexing: + # See tests.indexing.interval.test_interval:test_loc_getitem_frame indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - msg = "a list-indexer must only include values that are in the categories" - if self.hasnans: - msg += " or NA" - try: - codes = self._data._validate_setitem_value(keyarr) - except (ValueError, TypeError) as err: - if "Index data must be 1-dimensional" in str(err): - # e.g. 
test_setitem_ndarray_3d - raise - raise KeyError(msg) - if not self.hasnans and (codes == -1).any(): - raise KeyError(msg) - - return self.get_indexer(keyarr) + return self.get_indexer_for(keyarr) @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side: str, kind): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e308fee5fc808..62b1554246e26 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1246,9 +1246,7 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): indexer, keyarr = ax._convert_listlike_indexer(key) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer( - keyarr, indexer, axis, raise_missing=raise_missing - ) + # _validate_read_indexer is a no-op if no -1s, so skip return ax[indexer], indexer if ax._index_as_unique: @@ -1309,21 +1307,15 @@ def _validate_read_indexer( not_found = list(set(key) - set(ax)) raise KeyError(f"{not_found} not in index") - # we skip the warning on Categorical - # as this check is actually done (check for - # non-missing values), but a bit later in the - # code, so we want to avoid warning & then - # just raising - if not ax.is_categorical(): - not_found = key[missing_mask] - - with option_context("display.max_seq_items", 10, "display.width", 80): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported. " - f"The following labels were missing: {not_found}. " - "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + not_found = key[missing_mask] + + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. 
" + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9b9ece68b887e..94fc3960f24c5 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -254,23 +256,38 @@ def test_slicing_doc_examples(self): ) tm.assert_frame_equal(result, expected) - def test_loc_listlike(self): - + def test_loc_getitem_listlike_labels(self): # list of labels result = self.df.loc[["c", "a"]] expected = self.df.iloc[[4, 0, 1, 5]] tm.assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.loc[["a", "b", "e"]] - exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") - expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) - tm.assert_frame_equal(result, expected, check_index_type=True) + def test_loc_getitem_listlike_unused_category(self): + # GH#37901 a label that is in index.categories but not in index + # listlike containing an element in the categories but not in the values + msg = ( + "The following labels were missing: CategoricalIndex(['e'], " + "categories=['c', 'a', 'b', 'e'], ordered=False, name='B', " + "dtype='category')" + ) + with pytest.raises(KeyError, match=re.escape(msg)): + self.df2.loc[["a", "b", "e"]] + def test_loc_getitem_label_unused_category(self): # element in the categories but not in the values with pytest.raises(KeyError, match=r"^'e'$"): self.df2.loc["e"] - # assign is ok + def test_loc_getitem_non_category(self): + # not all labels in the categories + msg = ( + "The following labels were missing: Index(['d'], dtype='object', name='B')" + ) + with pytest.raises(KeyError, match=re.escape(msg)): + self.df2.loc[["a", "d"]] + + def test_loc_setitem_expansion_label_unused_category(self): + # 
assigning with a label that is in the categories but not in the index df = self.df2.copy() df.loc["e"] = 20 result = df.loc[["a", "b", "e"]] @@ -278,17 +295,6 @@ def test_loc_listlike(self): expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) tm.assert_frame_equal(result, expected) - df = self.df2.copy() - result = df.loc[["a", "b", "e"]] - exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") - expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) - tm.assert_frame_equal(result, expected, check_index_type=True) - - # not all labels in the categories - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): - self.df2.loc[["a", "d"]] - def test_loc_listlike_dtypes(self): # GH 11586 @@ -309,8 +315,8 @@ def test_loc_listlike_dtypes(self): exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + msg = "The following labels were missing: Index(['x'], dtype='object')" + with pytest.raises(KeyError, match=re.escape(msg)): df.loc[["a", "x"]] # duplicated categories and codes @@ -332,8 +338,7 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + with pytest.raises(KeyError, match=re.escape(msg)): df.loc[["a", "x"]] # contains unused category @@ -347,13 +352,6 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - res = df.loc[["a", "e"]] - exp = DataFrame( - {"A": [1, 3, np.nan], "B": [5, 7, np.nan]}, - index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), - ) - tm.assert_frame_equal(res, exp, check_index_type=True) - # duplicated slice res = df.loc[["a", 
"a", "b"]] exp = DataFrame( @@ -362,10 +360,27 @@ def test_loc_listlike_dtypes(self): ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = "a list-indexer must only include values that are in the categories" - with pytest.raises(KeyError, match=msg): + with pytest.raises(KeyError, match=re.escape(msg)): df.loc[["a", "x"]] + def test_loc_getitem_listlike_unused_category_raises_keyerro(self): + # key that is an *unused* category raises + index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) + + with pytest.raises(KeyError, match="e"): + # For comparison, check the scalar behavior + df.loc["e"] + + msg = ( + "Passing list-likes to .loc or [] with any missing labels is no " + "longer supported. The following labels were missing: " + "CategoricalIndex(['e'], categories=['a', 'b', 'c', 'd', 'e'], " + "ordered=False, dtype='category'). See https" + ) + with pytest.raises(KeyError, match=re.escape(msg)): + df.loc[["a", "e"]] + def test_ix_categorical_index(self): # GH 12531 df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b45eddc3ac49c..28846bcf2f14d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1674,7 +1674,12 @@ def test_loc_getitem_list_of_labels_categoricalindex_with_na(self, box): ser2 = ser[:-1] ci2 = ci[1:] # but if there are no NAs present, this should raise KeyError - msg = "a list-indexer must only include values that are in the categories" + msg = ( + r"Passing list-likes to .loc or \[\] with any missing labels is no " + "longer supported. The following labels were missing: " + r"(Categorical)?Index\(\[nan\], .*\). " + "See https" + ) with pytest.raises(KeyError, match=msg): ser2.loc[box(ci2)]