From 9023cb0e746df5783c958ce225bd5a4474b59a66 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 5 Jun 2023 21:42:53 -0400 Subject: [PATCH 1/4] BUG: Series.str.split(expand=True) for ArrowDtype(pa.string()) --- pandas/core/strings/accessor.py | 4 ++++ pandas/tests/extension/test_arrow.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 08c2736bf9816..9acaf660bf841 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -287,6 +287,10 @@ def _wrap_result( labels = name else: labels = range(max_len) + # append nulls to each scalar list element up to max_len + result._pa_array = pa.compute.list_slice( + result._pa_array, start=0, stop=max_len, return_fixed_size_list=True + ) result = { label: ArrowExtensionArray(pa.array(res)) for label, res in zip(labels, (zip(*result.tolist()))) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a0d15c70b5720..8625500a83e79 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2286,6 +2286,15 @@ def test_str_split(): ) tm.assert_frame_equal(result, expected) + result = ser.str.split("1", expand=True) + expected = pd.DataFrame( + { + 0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])), + 1: ArrowExtensionArray(pa.array(["cbcb", None, None])), + } + ) + tm.assert_frame_equal(result, expected) + def test_str_rsplit(): # GH 52401 @@ -2311,6 +2320,15 @@ def test_str_rsplit(): ) tm.assert_frame_equal(result, expected) + result = ser.str.rsplit("1", expand=True) + expected = pd.DataFrame( + { + 0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])), + 1: ArrowExtensionArray(pa.array(["cbcb", None, None])), + } + ) + tm.assert_frame_equal(result, expected) + def test_str_unsupported_extract(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) From ad8fd76a47265d217717dd74405da1c9181487d6 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 5 Jun 2023 22:00:10 -0400 Subject: [PATCH 2/4] whatsnew --- doc/source/whatsnew/v2.0.3.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst index 2c63d7d20ed1c..89c64b02e0cb5 100644 --- a/doc/source/whatsnew/v2.0.3.rst +++ b/doc/source/whatsnew/v2.0.3.rst @@ -22,6 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`) +- Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`) +- .. --------------------------------------------------------------------------- .. _whatsnew_203.other: From 970a2072069c00e65101d7d5d8f9ff708cc3cff4 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 6 Jun 2023 07:22:03 -0400 Subject: [PATCH 3/4] min versions --- pandas/core/strings/accessor.py | 34 +++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9acaf660bf841..5d800c830f666 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -275,22 +275,40 @@ def _wrap_result( if isinstance(result.dtype, ArrowDtype): import pyarrow as pa + from pandas.compat import pa_version_under11p0 + from pandas.core.arrays.arrow.array import ArrowExtensionArray - max_len = pa.compute.max( - result._pa_array.combine_chunks().value_lengths() - ).as_py() - if result.isna().any(): + value_lengths = result._pa_array.combine_chunks().value_lengths() + max_len = pa.compute.max(value_lengths).as_py() + min_len = pa.compute.min(value_lengths).as_py() + if result._hasna: # ArrowExtensionArray.fillna doesn't work for list scalars result._pa_array = result._pa_array.fill_null([None] * max_len) + if min_len < max_len: + # append nulls to each scalar list element up to max_len + if not pa_version_under11p0: + result._pa_array = pa.compute.list_slice( + result._pa_array, + start=0, + stop=max_len, + return_fixed_size_list=True, + ) + else: + all_null = np.full(max_len, fill_value=None, dtype=object) + values = result.to_numpy() + new_values = [] + for row in values: + if len(row) < max_len: + nulls = all_null[: max_len - len(row)] + row = np.append(row, nulls) + new_values.append(row) + pa_type = result._pa_array.type + result._pa_array = pa.array(new_values, type=pa_type) if name is not None: labels = name else: labels = range(max_len) - # append nulls to each scalar list element up to max_len - result._pa_array = pa.compute.list_slice( - result._pa_array, start=0, stop=max_len, return_fixed_size_list=True - ) result = { label: ArrowExtensionArray(pa.array(res)) for label, res in zip(labels, (zip(*result.tolist()))) From ae9bcea4f43841d9333fc35ccba1c6d6dc851804 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 6 Jun 2023 17:46:32 -0400 Subject: [PATCH 4/4] ensure ArrowExtensionArray --- pandas/core/strings/accessor.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5d800c830f666..127ad5e962b16 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -284,15 +284,19 @@ def _wrap_result( min_len = pa.compute.min(value_lengths).as_py() if result._hasna: # ArrowExtensionArray.fillna doesn't work for list scalars - result._pa_array = result._pa_array.fill_null([None] * max_len) + result = ArrowExtensionArray( + result._pa_array.fill_null([None] * max_len) + ) if min_len < max_len: # append nulls to each scalar list element up to max_len if not pa_version_under11p0: - result._pa_array = pa.compute.list_slice( - result._pa_array, - start=0, - stop=max_len, - return_fixed_size_list=True, + result = ArrowExtensionArray( + pa.compute.list_slice( + result._pa_array, + start=0, + stop=max_len, + return_fixed_size_list=True, + ) ) else: all_null = np.full(max_len, fill_value=None, dtype=object) @@ -304,7 +308,7 @@ def _wrap_result( row = np.append(row, nulls) new_values.append(row) pa_type = result._pa_array.type - result._pa_array = pa.array(new_values, type=pa_type) + result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) if name is not None: labels = name else: