From 75948c5e4e88eb276f0188f1aefa6ea82e1bca83 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jun 2021 21:54:07 -0700 Subject: [PATCH 1/6] BUG: DataFrame.at with CategoricalIndex --- pandas/core/frame.py | 29 ++++++++++++++++++----------- pandas/core/indexing.py | 5 ++--- pandas/tests/indexing/test_at.py | 14 ++++++++++++++ 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 91b9bdd564676..2edad9f6626bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -158,6 +158,7 @@ from pandas.core.indexers import check_key_length from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( + CategoricalIndex, DatetimeIndex, Index, PeriodIndex, @@ -3553,6 +3554,11 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: Returns ------- scalar + + Notes + ----- + Assumes that index and columns both have ax._index_as_unique; + caller is responsible for checking. """ if takeable: series = self._ixs(col, axis=1) @@ -3561,20 +3567,21 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: series = self._get_item_cache(col) engine = self.index._engine + if isinstance(self.index, CategoricalIndex): + # Trying to use the engine fastpath may give incorrect results + # if our categories are integers that dont match our codes + col = self.columns.get_loc(col) + index = self.index.get_loc(index) + return self._get_value(index, col, takeable=True) + try: loc = engine.get_loc(index) return series._values[loc] - except KeyError: - # GH 20629 - if self.index.nlevels > 1: - # partial indexing forbidden - raise - - # we cannot handle direct indexing - # use positional - col = self.columns.get_loc(col) - index = self.index.get_loc(index) - return self._get_value(index, col, takeable=True) + except AttributeError: + # IntervalTree has no get_loc + col = self.columns.get_loc(col) + index = self.index.get_loc(index) + return self._get_value(index, col, takeable=True) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3707e141bc447..d62dcdba92bc7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -916,8 +916,7 @@ def __getitem__(self, key): key = tuple(list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - with suppress(KeyError, IndexError, AttributeError): - # AttributeError for IntervalTree get_value + with suppress(KeyError, IndexError): return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: @@ -1004,7 +1003,7 @@ def _is_scalar_access(self, key: tuple) -> bool: # should not be considered scalar return False - if not ax.is_unique: + if not ax._index_as_unique: return False return True diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 77cfb94bf4629..23d2bee612243 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -8,6 +8,7 @@ from pandas import ( CategoricalDtype, + CategoricalIndex, DataFrame, Series, Timestamp, @@ -141,3 +142,16 @@ def test_at_getitem_mixed_index_no_fallback(self): ser.at[0] with pytest.raises(KeyError, match="^4$"): ser.at[4] + + def test_at_categorical_integers(self): + # CategoricalIndex with integer categories that don't happen to match + # the Categorical's codes + ci = CategoricalIndex([3, 4]) + + arr = np.arange(4).reshape(2, 2) + frame = DataFrame(arr, index=ci) + + for df in [frame, frame.T]: + for key in [0, 1]: + with pytest.raises(KeyError, match=str(key)): + df.at[key, key] From 9000840371c3cb4241885f4dad6a993c96e943e5 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jun 2021 12:44:14 -0700 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index e2b923812a211..655038e1bb72b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1007,6 +1007,7 @@ Indexing - Bug in :meth:`DataFrame.loc.__getitem__` with :class:`MultiIndex` casting to float when at least one index column has float dtype and we retrieve a scalar (:issue:`41369`) - Bug in :meth:`DataFrame.loc` incorrectly matching non-Boolean index elements (:issue:`20432`) - Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) +- Bug in :meth:`DataFrame.at` with a :class:`CategoricalIndex` returning incorrect results when passed integer keys (:issue:`41846`) - Bug in :meth:`DataFrame.loc` returning a :class:`MultiIndex` in the wrong order if an indexer has duplicates (:issue:`40978`) - Bug in :meth:`DataFrame.__setitem__` raising a ``TypeError`` when using a ``str`` subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) - Bug in :meth:`PeriodIndex.get_loc` failing to raise a ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) From a450c0b866c75a2818fd5e65ff25a6caaa15ba73 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Jun 2021 07:58:49 -0700 Subject: [PATCH 3/6] REF: access index._engine less in DataFrame --- pandas/core/frame.py | 32 +++++++++++++----------------- pandas/core/indexing.py | 2 +- pandas/tests/indexing/test_iloc.py | 2 +- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2edad9f6626bb..ef291dfcc5453 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -158,7 +158,6 @@ from pandas.core.indexers import check_key_length from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( - CategoricalIndex, DatetimeIndex, Index, PeriodIndex, @@ -3557,8 +3556,9 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: Notes ----- - Assumes that index and columns both have ax._index_as_unique; - caller is responsible for checking. + Assumes that both `self.index._index_as_unique` and + `self.columns._index_as_unique`; Caller is responsible for checking. + """ if takeable: series = self._ixs(col, axis=1) @@ -3567,21 +3567,17 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: series = self._get_item_cache(col) engine = self.index._engine - if isinstance(self.index, CategoricalIndex): - # Trying to use the engine fastpath may give incorrect results - # if our categories are integers that dont match our codes - col = self.columns.get_loc(col) - index = self.index.get_loc(index) - return self._get_value(index, col, takeable=True) - - try: - loc = engine.get_loc(index) - return series._values[loc] - except AttributeError: - # IntervalTree has no get_loc - col = self.columns.get_loc(col) - index = self.index.get_loc(index) - return self._get_value(index, col, takeable=True) + if not isinstance(self.index, MultiIndex): + # CategoricalIndex: Trying to use the engine fastpath may give incorrect + # results if our categories are integers that dont match our codes + # IntervalIndex: IntervalTree has no get_loc + row = self.index.get_loc(index) + return series._values[row] + + # For MultiIndex going through engine effectively restricts us to + # same-length tuples; see test_get_set_value_no_partial_indexing + loc = engine.get_loc(index) + return series._values[loc] def __setitem__(self, key, value): key = com.apply_if_callable(key, self) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d62dcdba92bc7..1e7ef9ef261ee 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -916,7 +916,7 @@ def __getitem__(self, key): key = tuple(list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - with suppress(KeyError, IndexError): + with suppress(KeyError): return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index fc07c14f1e179..b04a2c86a79d7 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -596,7 +596,7 @@ def test_iloc_getitem_labelled_frame(self): assert result == exp # out-of-bounds exception - msg = "single positional indexer is out-of-bounds" + msg = "index 5 is out of bounds for axis 0 with size 4" with pytest.raises(IndexError, match=msg): df.iloc[10, 5] From 6d2d09f082377c3715be926d5cbbbb58c6f191b0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Jun 2021 09:05:24 -0700 Subject: [PATCH 4/6] REF: avoid _engine use in DataFrame --- pandas/core/frame.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ef291dfcc5453..ac6ebf4859e62 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3812,8 +3812,7 @@ def _set_value( return series = self._get_item_cache(col) - engine = self.index._engine - loc = engine.get_loc(index) + loc = self.index.get_loc(index) validate_numeric_casting(series.dtype, value) series._values[loc] = value From c69d7f4fde74965680558ca8bc8c32277bf69339 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Jun 2021 18:47:24 -0700 Subject: [PATCH 5/6] CLN: simplify, suppress less --- pandas/core/indexing.py | 15 +++++---------- pandas/tests/indexing/test_indexing.py | 2 +- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1e7ef9ef261ee..0fd2eedae2da9 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -916,8 +916,7 @@ def __getitem__(self, key): key = tuple(list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - with suppress(KeyError): - return self.obj._get_value(*key, takeable=self._takeable) + return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: # we by definition only have the 0th axis @@ -2187,7 +2186,7 @@ class _ScalarAccessIndexer(NDFrameIndexerBase): Access scalars quickly. """ - def _convert_key(self, key, is_setter: bool = False): + def _convert_key(self, key): raise AbstractMethodError(self) def __getitem__(self, key): @@ -2211,7 +2210,7 @@ def __setitem__(self, key, value): if not isinstance(key, tuple): key = _tuplify(self.ndim, key) - key = list(self._convert_key(key, is_setter=True)) + key = list(self._convert_key(key)) if len(key) != self.ndim: raise ValueError("Not enough indexers for scalar access (setting)!") @@ -2222,7 +2221,7 @@ def __setitem__(self, key, value): class _AtIndexer(_ScalarAccessIndexer): _takeable = False - def _convert_key(self, key, is_setter: bool = False): + def _convert_key(self, key): """ Require they keys to be the same type as the index. (so we don't fallback) @@ -2233,10 +2232,6 @@ def _convert_key(self, key, is_setter: bool = False): if self.ndim == 1 and len(key) > 1: key = (key,) - # allow arbitrary setting - if is_setter: - return list(key) - return key @property @@ -2271,7 +2266,7 @@ def __setitem__(self, key, value): class _iAtIndexer(_ScalarAccessIndexer): _takeable = True - def _convert_key(self, key, is_setter: bool = False): + def _convert_key(self, key): """ Require integer args. (and convert to label arguments) """ diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index c945bd6b95ee1..7ac24fb9c45f1 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -527,7 +527,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="'2011'"): df["2011"] - with pytest.raises(KeyError, match="'2011'"): + with pytest.raises(KeyError, match="0"): df.loc["2011", 0] def test_astype_assignment(self): From 95c9ad098e887fe009a2fe3750c3d386fc5ba89a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Jun 2021 07:14:40 -0700 Subject: [PATCH 6/6] stricter exception message --- pandas/tests/indexing/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 55362badaa7d6..7243f2cddfec6 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -518,7 +518,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="'2011'"): df["2011"] - with pytest.raises(KeyError, match="0"): + with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] def test_astype_assignment(self):