From 4576b0a7f0596f15f11644af0bc5d0d7694d80e3 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 09:52:04 -0600 Subject: [PATCH 1/7] Raise on infinite loop --- pandas/core/indexes/base.py | 8 ++++++++ pandas/tests/indexes/test_indexing.py | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 18d6834e6191c..553393abb7b23 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5504,6 +5504,14 @@ def putmask(self, mask, value) -> Index: # See also: Block.coerce_to_target_dtype dtype = self._find_common_type_compat(value) + + # Prevent an infinite putmask loop GH56376 + if dtype == self.dtype: + raise AssertionError( + "Something has gone wrong, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) from err + return self.astype(dtype).putmask(mask, value) values = self._values.copy() diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 1ea47f636ac9b..1a31935c7ac2b 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -26,6 +26,7 @@ from pandas import ( NA, + CategoricalDtype, DatetimeIndex, Index, IntervalIndex, @@ -303,6 +304,16 @@ def test_putmask_with_wrong_mask(self, index): index.putmask("foo", fill) +def test_putmask_infinite_loop(): + # Check that putmask won't get stuck in an infinite loop GH56376 + index = Index([1, 2, 0], dtype="int64") + dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64")) + value = Index([1.0, np.NaN, 3.0], dtype=dtype) + + with pytest.raises(AssertionError, match="please report a bug"): + index.where([True, True, False], value) + + @pytest.mark.parametrize( "idx", [Index([1, 2, 3]), Index([0.1, 0.2, 0.3]), Index(["a", "b", "c"])] ) From 7a33ca3db8197220010040284e46720fcc767d21 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 10:32:20 -0600 Subject: [PATCH 2/7] Extend categorical fix to float types --- pandas/core/dtypes/cast.py | 17 +++++++++++++++++ pandas/tests/indexing/test_loc.py | 11 ++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d5144174d3c71..e58021778c2f8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1834,6 +1834,23 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: raise LossySetitemError if tipo is not None: + if isinstance(element, ABCExtensionArray) and isinstance( + element.dtype, CategoricalDtype + ): + # GH#52927,56376 setting Categorical value into non-EA frame + # TODO: general-case for EAs? + try: + casted = element.astype(dtype) + except (ValueError, TypeError): + raise LossySetitemError + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): + raise LossySetitemError + return casted + # TODO: itemsize check? if tipo.kind not in "iuf": # Anything other than float/integer we cannot hold diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ce7dde3c4cb42..ec4a4ee87c3a9 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1677,12 +1677,17 @@ def test_loc_setitem_range_key(self, frame_or_series): expected = frame_or_series([0, 1, 10, 9, 11], index=obj.index) tm.assert_equal(obj, expected) - def test_loc_setitem_numpy_frame_categorical_value(self): + @pytest.mark.parametrize("dtype", ["int64", "float64"]) + def test_loc_setitem_numpy_frame_categorical_value(self, dtype): # GH#52927 - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}).astype( + {"a": dtype} + ) df.loc[1:2, "a"] = Categorical([2, 2], categories=[1, 2]) - expected = DataFrame({"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + expected = DataFrame( + {"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]} + ).astype({"a": dtype}) tm.assert_frame_equal(df, expected) From 0a433b8b6d21bfa809e8a50166555636e422c742 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 10:35:18 -0600 Subject: [PATCH 3/7] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8209525721b98..8ae9a680e4bdd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -604,7 +604,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) -- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) +- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`, :issue:`56376`) Missing ^^^^^^^ From fffcaf8eaf70d60deac3a94a0721c216adaf66c0 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 10:48:14 -0600 Subject: [PATCH 4/7] Handle categorical types in Series and Indexes --- pandas/core/dtypes/cast.py | 12 ++++++------ pandas/tests/indexes/test_indexing.py | 24 +++++++++++++++++++++++- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e58021778c2f8..c7522ed65ca4c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1780,9 +1780,9 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: return casted raise LossySetitemError - elif isinstance(element, ABCExtensionArray) and isinstance( - element.dtype, CategoricalDtype - ): + elif isinstance( + element, (ABCExtensionArray, ABCIndex, ABCSeries) + ) and isinstance(element.dtype, CategoricalDtype): # GH#52927 setting Categorical value into non-EA frame # TODO: general-case for EAs? try: @@ -1834,9 +1834,9 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: raise LossySetitemError if tipo is not None: - if isinstance(element, ABCExtensionArray) and isinstance( - element.dtype, CategoricalDtype - ): + if isinstance( + element, (ABCExtensionArray, ABCIndex, ABCSeries) + ) and isinstance(element.dtype, CategoricalDtype): # GH#52927,56376 setting Categorical value into non-EA frame # TODO: general-case for EAs? try: diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 1a31935c7ac2b..f67f171f8d787 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -26,6 +26,7 @@ from pandas import ( NA, + Categorical, CategoricalDtype, DatetimeIndex, Index, @@ -33,9 +34,11 @@ MultiIndex, NaT, PeriodIndex, + Series, TimedeltaIndex, ) import pandas._testing as tm +from pandas._testing.asserters import assert_index_equal class TestTake: @@ -304,6 +307,25 @@ def test_putmask_with_wrong_mask(self, index): index.putmask("foo", fill) +def test_putmask_categorical(): + # Check that putmask can use categorical values in various forms GH56376 + index = Index([2, 1, 0], dtype="int64") + dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64")) + + value = Categorical([1.0, 2.0, 3.0], dtype=dtype) + result = index.putmask([True, True, False], value) + expected = Index([1, 2, 0], dtype="int64") + assert_index_equal(result, expected) + + value = Series([1.0, 2.0, 3.0], dtype=dtype) + result = index.putmask([True, True, False], value) + assert_index_equal(result, expected) + + value = Index([1.0, 2.0, 3.0], dtype=dtype) + result = index.putmask([True, True, False], value) + assert_index_equal(result, expected) + + def test_putmask_infinite_loop(): # Check that putmask won't get stuck in an infinite loop GH56376 index = Index([1, 2, 0], dtype="int64") @@ -311,7 +333,7 @@ def test_putmask_infinite_loop(): value = Index([1.0, np.NaN, 3.0], dtype=dtype) with pytest.raises(AssertionError, match="please report a bug"): - index.where([True, True, False], value) + index.putmask([True, True, False], value) @pytest.mark.parametrize( From d307b3653d32efcc01dabc2dd3f0656006b15e38 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 16:42:16 -0600 Subject: [PATCH 5/7] Fix import for testing --- pandas/tests/indexes/test_indexing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index f67f171f8d787..db471d8879229 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -38,7 +38,6 @@ TimedeltaIndex, ) import pandas._testing as tm -from pandas._testing.asserters import assert_index_equal class TestTake: @@ -315,15 +314,15 @@ def test_putmask_categorical(): value = Categorical([1.0, 2.0, 3.0], dtype=dtype) result = index.putmask([True, True, False], value) expected = Index([1, 2, 0], dtype="int64") - assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) value = Series([1.0, 2.0, 3.0], dtype=dtype) result = index.putmask([True, True, False], value) - assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) value = Index([1.0, 2.0, 3.0], dtype=dtype) result = index.putmask([True, True, False], value) - assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) def test_putmask_infinite_loop(): From 59d1056e53bae498e5980e89d5c2099025019a19 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 20:59:04 -0600 Subject: [PATCH 6/7] Deal with mypy type narrowing on Any? --- pandas/core/dtypes/cast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c7522ed65ca4c..aad5a44de651c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1749,6 +1749,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: tipo = _maybe_infer_dtype_type(element) + casted: Any # For mypy if dtype.kind in "iu": if isinstance(element, range): if _dtype_can_hold_range(element, dtype): @@ -1858,7 +1859,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if not isinstance(tipo, np.dtype): # i.e. nullable IntegerDtype or FloatingDtype; # we can put this into an ndarray losslessly iff it has no NAs - if element._hasna: + if element._hasna: # type: ignore[union-attr] raise LossySetitemError return element elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind: From 51ef24cc48b64f07e1a45cdd1760190742a0c2d7 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Wed, 13 Dec 2023 21:49:58 -0600 Subject: [PATCH 7/7] nan not NaN --- pandas/tests/indexes/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index db471d8879229..35705f74d325c 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -329,7 +329,7 @@ def test_putmask_infinite_loop(): # Check that putmask won't get stuck in an infinite loop GH56376 index = Index([1, 2, 0], dtype="int64") dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64")) - value = Index([1.0, np.NaN, 3.0], dtype=dtype) + value = Index([1.0, np.nan, 3.0], dtype=dtype) with pytest.raises(AssertionError, match="please report a bug"): index.putmask([True, True, False], value)