diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0f669beaa036f..255c707a04a94 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -533,13 +533,13 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Bug in :class:`ArrowStringArray` constructor raises ``ValueError`` with dictionary types of strings (:issue:`54074`) - Bug in :class:`DataFrame` constructor not copying :class:`Series` with extension dtype when given in dict (:issue:`53744`) - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) -- Styler ^^^^^^ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 12f4b5486b6b9..4a70fcf6b5a93 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -118,7 +118,10 @@ def __init__(self, values) -> None: super().__init__(values) self._dtype = StringDtype(storage="pyarrow") - if not pa.types.is_string(self._pa_array.type): + if not pa.types.is_string(self._pa_array.type) and not ( + pa.types.is_dictionary(self._pa_array.type) + and pa.types.is_string(self._pa_array.type.value_type) + ): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of string type" ) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 45098e12ccb38..0f899f4c8e876 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -69,6 +69,33 @@ def test_constructor_not_string_type_raises(array, chunked): ArrowStringArray(arr) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_not_string_type_value_dictionary_raises(chunked): + pa = pytest.importorskip("pyarrow") + + arr = pa.array([1, 2, 3], pa.dictionary(pa.int32(), pa.int32())) + if chunked: + arr = pa.chunked_array(arr) + + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) + + +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_type_value_dictionary(chunked): + pa = pytest.importorskip("pyarrow") + + arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8())) + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + assert pa.types.is_string(arr._pa_array.type.value_type) + + @skip_if_no_pyarrow def test_from_sequence_wrong_dtype_raises(): with pd.option_context("string_storage", "python"):