
Switch arrow type for string array to large string #56220


Merged · 19 commits · Dec 21, 2023
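Background (summary mine, not from the PR page): PyArrow's `string` type stores values with 32-bit offsets, which caps a single array at roughly 2 GiB of character data; `large_string` uses 64-bit offsets and removes that cap. A minimal sketch of the distinction:

```python
import pyarrow as pa

# string carries int32 offsets (~2 GiB of character data per array);
# large_string carries int64 offsets and lifts that limit.
small = pa.array(["a", "bb"], type=pa.string())
large = small.cast(pa.large_string())

print(small.type)                            # string
print(large.type)                            # large_string
print(pa.types.is_large_string(large.type))  # True
```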
Changes from 2 commits
11 changes: 9 additions & 2 deletions pandas/core/arrays/arrow/array.py
@@ -667,7 +667,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
         pa_type = self._pa_array.type
         other = self._box_pa(other)
 
-        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
+        if (
+            pa.types.is_string(pa_type)
+            or pa.types.is_large_string(pa_type)
+            or pa.types.is_binary(pa_type)
+        ) and op in [
             operator.add,
             roperator.radd,
         ]:
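A hedged illustration of what this hunk preserves (assuming pandas 2.1+, where `string[pyarrow_numpy]` selects the NumPy-semantics Arrow-backed string dtype): with the backing arrays now typed `large_string`, the add/radd fast path has to match that type too, or elementwise concatenation would no longer hit the string kernel:

```python
import pandas as pd

# Assumes pandas 2.1+ with the pyarrow_numpy string storage available.
arr = pd.array(["a", "b"], dtype="string[pyarrow_numpy]")
print(arr + "_x")  # elementwise concat: ["a_x", "b_x"]
```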
@@ -1417,7 +1421,10 @@ def _concat_same_type(cls, to_concat) -> Self:
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         if to_concat[0].dtype == "string":
             # StringDtype has no attribute pyarrow_dtype
-            pa_dtype = pa.string()
+            if to_concat[0].dtype.storage == "pyarrow_numpy":
+                pa_dtype = pa.large_string()
+            else:
+                pa_dtype = pa.string()
         else:
             pa_dtype = to_concat[0].dtype.pyarrow_dtype
         arr = pa.chunked_array(chunks, type=pa_dtype)
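The `_concat_same_type` branch can be observed through public `pd.concat`; a small sketch (`_pa_array` is a private attribute, inspected here only for illustration):

```python
import pandas as pd

s1 = pd.Series(["a"], dtype="string[pyarrow_numpy]")
s2 = pd.Series(["b"], dtype="string[pyarrow_numpy]")
out = pd.concat([s1, s2], ignore_index=True)

# The rebuilt chunked array is typed large_string for pyarrow_numpy storage.
print(out.array._pa_array.type)  # large_string
```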
32 changes: 30 additions & 2 deletions pandas/core/arrays/string_arrow.py
@@ -129,6 +129,9 @@ def __init__(self, values) -> None:
         super().__init__(values)
         self._dtype = StringDtype(storage=self._storage)
 
+        self._check_string_type()
+
+    def _check_string_type(self):
         if not pa.types.is_string(self._pa_array.type) and not (
             pa.types.is_dictionary(self._pa_array.type)
             and pa.types.is_string(self._pa_array.type.value_type)
@@ -577,12 +580,37 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray):
     def __init__(self, values) -> None:
         _chk_pyarrow_available()
 
-        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
+        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
             values.type
         ):
-            values = pc.cast(values, pa.string())
+            values = pc.cast(values, pa.large_string())
         super().__init__(values)
 
+    def _check_string_type(self):
+        if not pa.types.is_large_string(self._pa_array.type) and not (
+            pa.types.is_dictionary(self._pa_array.type)
+            and pa.types.is_large_string(self._pa_array.type.value_type)
+        ):
+            raise ValueError(
+                "ArrowStringArray requires a PyArrow (chunked) array of string type"
+            )
+
+    @classmethod
+    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
+        pa_scalar = super()._box_pa_scalar(value, pa_type)
+        if pa.types.is_string(pa_scalar.type) and pa_type is None:
+            pa_scalar = pc.cast(pa_scalar, pa.large_string())
+        return pa_scalar
+
+    @classmethod
+    def _box_pa_array(
+        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
+    ) -> pa.Array | pa.ChunkedArray:
+        pa_array = super()._box_pa_array(value, pa_type)
+        if pa.types.is_string(pa_array.type) and pa_type is None:
+            pa_array = pc.cast(pa_array, pa.large_string())
+        return pa_array
+
     @classmethod
     def _result_converter(cls, values, na=None):
         if not isna(na):
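What the `_box_pa_scalar`/`_box_pa_array` overrides do, sketched directly in PyArrow: values boxed without an explicit `pa_type` come back as plain `string` and are upcast to `large_string` so they match the backing array's offset width:

```python
import pyarrow as pa
import pyarrow.compute as pc

scalar = pa.scalar("x")   # boxed as string by default
print(scalar.type)        # string

# Upcast so kernels run against a large_string column without mixing
# 32-bit and 64-bit offset types.
scalar = pc.cast(scalar, pa.large_string())
print(scalar.type)        # large_string
```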
28 changes: 23 additions & 5 deletions pandas/tests/arrays/string_/test_string.py
@@ -478,25 +478,35 @@ def test_fillna_args(dtype, request, arrow_string_storage):
 def test_arrow_array(dtype):
     # protocol added in 0.15.0
     pa = pytest.importorskip("pyarrow")
+    import pyarrow.compute as pc
 
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
     if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
         expected = pa.chunked_array(expected)
 
+    if dtype.storage == "pyarrow_numpy":
+        expected = pc.cast(arr, pa.large_string())
     assert arr.equals(expected)


 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_roundtrip(dtype, string_storage2):
+def test_arrow_roundtrip(dtype, string_storage2, request):
     # roundtrip possible from arrow 1.0.0
     pa = pytest.importorskip("pyarrow")
 
+    if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow":
(Review thread on this check)

Member: IMO we should just change both to be large_string.

Member (author): Fine by me, but not sure if we should deprecate the other one first?

Member: Is there any behavior difference expected between string and large string? If not, I don't think this needs a deprecation. I would consider it an implementation detail / feature.

Member (author): Not inside of pandas, no, but I don't know what happens if you take it outside of pandas.

Member: I would also change both at the same time (officially, the String dtype is also still considered experimental). It will change your schema when you convert to Arrow, so people will certainly have things to update, although I assume (hope) it will mostly be tests that check the exact type.

Member: Makes sense. Guessing some things at the binary level (e.g. pickle compatibility) might change across versions too.

Member (author): I'll make the change later and then we can merge. I can monitor some of the low-level stuff on the Dask CI.

Member (author): Updated to switch to large strings for both.

+        request.applymarker(
+            pytest.mark.xfail(reason="can't store large string in pyarrow string array")
+        )
+
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    assert table.field("a").type == "string"
+    if dtype.storage == "pyarrow_numpy":
+        assert table.field("a").type == "large_string"
+    else:
+        assert table.field("a").type == "string"
     with pd.option_context("string_storage", string_storage2):
         result = table.to_pandas()
         assert isinstance(result["a"].dtype, pd.StringDtype)
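The schema change discussed in the thread above is visible as soon as a frame is converted to Arrow; a sketch assuming pandas 2.1+ with the `future.infer_string` option (which selects the `pyarrow_numpy` storage):

```python
import pandas as pd
import pyarrow as pa

pd.set_option("future.infer_string", True)  # opt in to pyarrow_numpy strings

df = pd.DataFrame({"a": ["x", "y", None]})
table = pa.table(df)

# After this PR the column converts as large_string rather than string.
print(table.schema.field("a").type)  # large_string
```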
@@ -507,14 +517,22 @@ def test_arrow_roundtrip(dtype, string_storage2):


 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_load_from_zero_chunks(dtype, string_storage2):
+def test_arrow_load_from_zero_chunks(dtype, string_storage2, request):
     # GH-41040
     pa = pytest.importorskip("pyarrow")
 
+    if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow":
+        request.applymarker(
+            pytest.mark.xfail(reason="can't store large string in pyarrow string array")
+        )
+
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    assert table.field("a").type == "string"
+    if dtype.storage == "pyarrow_numpy":
+        assert table.field("a").type == "large_string"
+    else:
+        assert table.field("a").type == "string"
     # Instantiate the same table with no chunks at all
     table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
     with pd.option_context("string_storage", string_storage2):
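The xfail reason ("can't store large string in pyarrow string array") follows from the stricter type check added above: at this stage of the PR, the plain `pyarrow`-storage array still validates against the 32-bit `string` type only. A sketch of the failure (`ArrowStringArray` is internal pandas API, imported here purely to illustrate):

```python
import pyarrow as pa
from pandas.core.arrays.string_arrow import ArrowStringArray

try:
    # large_string input fails the pa.types.is_string check above.
    ArrowStringArray(pa.array(["a"], type=pa.large_string()))
except ValueError as err:
    print(err)  # ArrowStringArray requires a PyArrow (chunked) array of string type
```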