
Switch arrow type for string array to large string #56220


Merged · 19 commits · Dec 21, 2023
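Background (summary mine, not from the PR page): PyArrow's `string` type stores values with 32-bit offsets, which caps a single array at roughly 2 GiB of character data; `large_string` uses 64-bit offsets and removes that cap. A minimal sketch of the distinction:

```python
import pyarrow as pa

# string carries int32 offsets (~2 GiB of character data per array);
# large_string carries int64 offsets and lifts that limit.
small = pa.array(["a", "bb"], type=pa.string())
large = small.cast(pa.large_string())

print(small.type)                            # string
print(large.type)                            # large_string
print(pa.types.is_large_string(large.type))  # True
```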
Changes from 2 commits
11 changes: 9 additions & 2 deletions pandas/core/arrays/arrow/array.py
@@ -667,7 +667,11 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
         pa_type = self._pa_array.type
         other = self._box_pa(other)
 
-        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
+        if (
+            pa.types.is_string(pa_type)
+            or pa.types.is_large_string(pa_type)
+            or pa.types.is_binary(pa_type)
+        ) and op in [
             operator.add,
             roperator.radd,
         ]:
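A hedged illustration of what this hunk preserves (assuming pandas 2.1+, where `string[pyarrow_numpy]` selects the NumPy-semantics Arrow-backed string dtype): with the backing arrays now typed `large_string`, the add/radd fast path has to match that type too, or elementwise concatenation would no longer hit the string kernel:

```python
import pandas as pd

# Assumes pandas 2.1+ with the pyarrow_numpy string storage available.
arr = pd.array(["a", "b"], dtype="string[pyarrow_numpy]")
print(arr + "_x")  # elementwise concat: ["a_x", "b_x"]
```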
@@ -1417,7 +1421,10 @@ def _concat_same_type(cls, to_concat) -> Self:
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         if to_concat[0].dtype == "string":
             # StringDtype has no attribute pyarrow_dtype
-            pa_dtype = pa.string()
+            if to_concat[0].dtype.storage == "pyarrow_numpy":
+                pa_dtype = pa.large_string()
+            else:
+                pa_dtype = pa.string()
         else:
             pa_dtype = to_concat[0].dtype.pyarrow_dtype
         arr = pa.chunked_array(chunks, type=pa_dtype)
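The `_concat_same_type` branch can be observed through public `pd.concat`; a small sketch (`_pa_array` is a private attribute, inspected here only for illustration):

```python
import pandas as pd

s1 = pd.Series(["a"], dtype="string[pyarrow_numpy]")
s2 = pd.Series(["b"], dtype="string[pyarrow_numpy]")
out = pd.concat([s1, s2], ignore_index=True)

# The rebuilt chunked array is typed large_string for pyarrow_numpy storage.
print(out.array._pa_array.type)  # large_string
```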
32 changes: 30 additions & 2 deletions pandas/core/arrays/string_arrow.py
@@ -129,6 +129,9 @@ def __init__(self, values) -> None:
         super().__init__(values)
         self._dtype = StringDtype(storage=self._storage)
 
+        self._check_string_type()
+
+    def _check_string_type(self):
         if not pa.types.is_string(self._pa_array.type) and not (
             pa.types.is_dictionary(self._pa_array.type)
             and pa.types.is_string(self._pa_array.type.value_type)
@@ -577,12 +580,37 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray):
     def __init__(self, values) -> None:
         _chk_pyarrow_available()
 
-        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string(
+        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
             values.type
         ):
-            values = pc.cast(values, pa.string())
+            values = pc.cast(values, pa.large_string())
         super().__init__(values)
 
+    def _check_string_type(self):
+        if not pa.types.is_large_string(self._pa_array.type) and not (
+            pa.types.is_dictionary(self._pa_array.type)
+            and pa.types.is_large_string(self._pa_array.type.value_type)
+        ):
+            raise ValueError(
+                "ArrowStringArray requires a PyArrow (chunked) array of string type"
+            )
+
+    @classmethod
+    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
+        pa_scalar = super()._box_pa_scalar(value, pa_type)
+        if pa.types.is_string(pa_scalar.type) and pa_type is None:
+            pa_scalar = pc.cast(pa_scalar, pa.large_string())
+        return pa_scalar
+
+    @classmethod
+    def _box_pa_array(
+        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
+    ) -> pa.Array | pa.ChunkedArray:
+        pa_array = super()._box_pa_array(value, pa_type)
+        if pa.types.is_string(pa_array.type) and pa_type is None:
+            pa_array = pc.cast(pa_array, pa.large_string())
+        return pa_array
+
     @classmethod
     def _result_converter(cls, values, na=None):
         if not isna(na):
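What the `_box_pa_scalar`/`_box_pa_array` overrides do, sketched directly in PyArrow: values boxed without an explicit `pa_type` come back as plain `string` and are upcast to `large_string` so they match the backing array's offset width:

```python
import pyarrow as pa
import pyarrow.compute as pc

scalar = pa.scalar("x")   # boxed as string by default
print(scalar.type)        # string

# Upcast so kernels run against a large_string column without mixing
# 32-bit and 64-bit offset types.
scalar = pc.cast(scalar, pa.large_string())
print(scalar.type)        # large_string
```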
28 changes: 23 additions & 5 deletions pandas/tests/arrays/string_/test_string.py
@@ -478,25 +478,35 @@ def test_fillna_args(dtype, request, arrow_string_storage):
 def test_arrow_array(dtype):
     # protocol added in 0.15.0
     pa = pytest.importorskip("pyarrow")
+    import pyarrow.compute as pc
 
     data = pd.array(["a", "b", "c"], dtype=dtype)
     arr = pa.array(data)
     expected = pa.array(list(data), type=pa.string(), from_pandas=True)
     if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0:
         expected = pa.chunked_array(expected)
 
+    if dtype.storage == "pyarrow_numpy":
+        expected = pc.cast(arr, pa.large_string())
     assert arr.equals(expected)


 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_roundtrip(dtype, string_storage2):
+def test_arrow_roundtrip(dtype, string_storage2, request):
     # roundtrip possible from arrow 1.0.0
     pa = pytest.importorskip("pyarrow")
 
+    if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow":
(Review thread on this check)

Member: IMO we should just change both to be large_string.

Member (author): Fine by me, but not sure if we should deprecate the other one first?

Member: Is there any behavior difference expected between string and large string? If not, I don't think this needs a deprecation. I would consider it an implementation detail / feature.

Member (author): Not inside of pandas, no, but I don't know what happens if you take it outside of pandas.

Member: I would also change both at the same time (officially, the String dtype is also still considered experimental). It will change your schema when you convert to Arrow, so people will certainly have things to update, although I assume (hope) it will mostly be tests that check the exact type.

Member: Makes sense. Guessing some things at the binary level (e.g. pickle compatibility) might change across versions too.

Member (author): I'll make the change later and then we can merge. I can monitor some of the low-level stuff on the Dask CI.

Member (author): Updated to switch to large strings for both.

+        request.applymarker(
+            pytest.mark.xfail(reason="can't store large string in pyarrow string array")
+        )
+
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    assert table.field("a").type == "string"
+    if dtype.storage == "pyarrow_numpy":
+        assert table.field("a").type == "large_string"
+    else:
+        assert table.field("a").type == "string"
     with pd.option_context("string_storage", string_storage2):
         result = table.to_pandas()
         assert isinstance(result["a"].dtype, pd.StringDtype)
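The schema change discussed in the thread above is visible as soon as a frame is converted to Arrow; a sketch assuming pandas 2.1+ with the `future.infer_string` option (which selects the `pyarrow_numpy` storage):

```python
import pandas as pd
import pyarrow as pa

pd.set_option("future.infer_string", True)  # opt in to pyarrow_numpy strings

df = pd.DataFrame({"a": ["x", "y", None]})
table = pa.table(df)

# After this PR the column converts as large_string rather than string.
print(table.schema.field("a").type)  # large_string
```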
@@ -507,14 +517,22 @@ def test_arrow_roundtrip(dtype, string_storage2):


 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_load_from_zero_chunks(dtype, string_storage2):
+def test_arrow_load_from_zero_chunks(dtype, string_storage2, request):
     # GH-41040
     pa = pytest.importorskip("pyarrow")
 
+    if dtype.storage == "pyarrow_numpy" and string_storage2 == "pyarrow":
+        request.applymarker(
+            pytest.mark.xfail(reason="can't store large string in pyarrow string array")
+        )
+
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
-    assert table.field("a").type == "string"
+    if dtype.storage == "pyarrow_numpy":
+        assert table.field("a").type == "large_string"
+    else:
+        assert table.field("a").type == "string"
     # Instantiate the same table with no chunks at all
     table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
     with pd.option_context("string_storage", string_storage2):
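The xfail reason ("can't store large string in pyarrow string array") follows from the stricter type check added above: at this stage of the PR, the plain `pyarrow`-storage array still validates against the 32-bit `string` type only. A sketch of the failure (`ArrowStringArray` is internal pandas API, imported here purely to illustrate):

```python
import pyarrow as pa
from pandas.core.arrays.string_arrow import ArrowStringArray

try:
    # large_string input fails the pa.types.is_string check above.
    ArrowStringArray(pa.array(["a"], type=pa.large_string()))
except ValueError as err:
    print(err)  # ArrowStringArray requires a PyArrow (chunked) array of string type
```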