From 95ff273863eebd49ebb6654a86183d2be311df36 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 29 Apr 2024 22:12:42 +0200 Subject: [PATCH 1/2] BUG: astype not casting values for dictionary dtype correctly --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 2 ++ pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a81fb584c8df9..910ef278a22a9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -389,6 +389,7 @@ Numeric Conversion ^^^^^^^^^^ +- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`) - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1154130b9bed3..0240433cdb683 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -525,6 +525,8 @@ def _box_pa_array( if pa_type is not None and pa_array.type != pa_type: if pa.types.is_dictionary(pa_type): pa_array = pa_array.dictionary_encode() + if pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) else: try: pa_array = pa_array.cast(pa_type) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 79440b55dd5dd..a82f0c9d142e3 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3498,6 +3498,14 @@ def test_to_numpy_timestamp_to_int(): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.parametrize("arrow_type", [pa.large_string(), pa.string()]) +def test_cast_dictionary_different_value_dtype(arrow_type): + df = pd.DataFrame({"a": ["x", "y"]}, dtype="string[pyarrow]") + data_type = ArrowDtype(pa.dictionary(pa.int32(), arrow_type)) + result = df.astype(dtypes={"a": data_type}) + assert result.dtypes.iloc[0] == data_type + + def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") From 1d4134b774011de5c69c1ffbc395a39430883d1c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 29 Apr 2024 22:35:38 +0200 Subject: [PATCH 2/2] Fixup --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a82f0c9d142e3..7d31fe6085c3a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3502,7 +3502,7 @@ def test_to_numpy_timestamp_to_int(): def test_cast_dictionary_different_value_dtype(arrow_type): df = pd.DataFrame({"a": ["x", "y"]}, dtype="string[pyarrow]") data_type = ArrowDtype(pa.dictionary(pa.int32(), arrow_type)) - result = df.astype(dtypes={"a": data_type}) + result = df.astype({"a": data_type}) assert result.dtypes.iloc[0] == data_type