From 3629f971da82e747a421f96325d2f6b67c6642a4 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 20 Dec 2023 13:33:52 -0500 Subject: [PATCH 1/6] support tuple in startswith/endswith for arrow strings --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 26 ++++++++++++++++++++++++-- pandas/tests/extension/test_arrow.py | 12 +++++++----- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8c475791df64d..48b7a6811dc13 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -601,6 +601,7 @@ Strings - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) +- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`51970`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) Interval diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 633efe43fce1a..e7a78ae386f70 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2175,13 +2175,35 @@ def _str_contains( return type(self)(result) def _str_startswith(self, pat: str, na=None): - result = pc.starts_with(self._pa_array, pattern=pat) + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) def _str_endswith(self, pat: str, na=None): - result = pc.ends_with(self._pa_array, pattern=pat) + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 674a5da216011..98e6ecce50e8a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1801,14 +1801,16 @@ def test_str_contains_flags_unsupported(): @pytest.mark.parametrize( "side, pat, na, exp", [ - ["startswith", "ab", None, [True, None]], - ["startswith", "b", False, [False, False]], - ["endswith", "b", True, [False, True]], - ["endswith", "bc", None, [True, None]], + ["startswith", "ab", None, [True, None, False]], + ["startswith", "b", False, [False, False, False]], + ["endswith", "b", True, [False, True, False]], + ["endswith", "bc", None, [True, None, False]], + ["startswith", ("a", "e", "g"), None, [True, None, True]], + ["endswith", ("a", "c", "g"), None, [True, None, True]], ], ) def test_str_start_ends_with(side, pat, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", None, "efg"], dtype=ArrowDtype(pa.string())) result = getattr(ser.str, side)(pat, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) From 9100d45c3abe26f6e61c117c8cdaf0b95d8ca3ed Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 20 Dec 2023 13:38:05 -0500 Subject: [PATCH 2/6] gh --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 48b7a6811dc13..645a3cdb967ca 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -601,7 +601,7 @@ Strings - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) -- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`51970`) +- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) Interval From 618387f8a8731e479dd70ada7dc3cb544dce1063 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 20 Dec 2023 13:39:47 -0500 Subject: [PATCH 3/6] improve test --- pandas/tests/extension/test_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 98e6ecce50e8a..40038ec08a666 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1807,6 +1807,8 @@ def test_str_contains_flags_unsupported(): ["endswith", "bc", None, [True, None, False]], ["startswith", ("a", "e", "g"), None, [True, None, True]], ["endswith", ("a", "c", "g"), None, [True, None, True]], + ["startswith", (), None, [False, None, False]], + ["endswith", (), None, [False, None, False]], ], ) def test_str_start_ends_with(side, pat, na, exp): From 756b4e2f6838e514d08593f96d92cdb4ced4e535 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 20 Dec 2023 14:05:42 -0500 Subject: [PATCH 4/6] improve tests --- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 40038ec08a666..28ee5446d4b5c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1818,6 +1818,16 @@ def test_str_start_ends_with(side, pat, na, exp): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("side", ("startswith", "endswith")) +def test_str_starts_ends_with_all_nulls_empty_tuple(side): + ser = pd.Series([None, None], dtype=ArrowDtype(pa.string())) + result = getattr(ser.str, side)(()) + + # bool datatype preserved for all nulls. + expected = pd.Series([None, None], dtype=ArrowDtype(pa.bool_())) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "arg_name, arg", [["pat", re.compile("b")], ["repl", str], ["case", False], ["flags", 1]], From 4cfb7300370f107a76381ccc85b627abd8e44c64 Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Thu, 21 Dec 2023 01:08:59 -0500 Subject: [PATCH 5/6] Update pandas/core/arrays/arrow/array.py Co-authored-by: Xiao Yuan --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e7a78ae386f70..463b88c4e0024 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2191,7 +2191,7 @@ def _str_startswith(self, pat: str, na=None): result = result.fill_null(na) return type(self)(result) - def _str_endswith(self, pat: str, na=None): + def _str_endswith(self, pat: str | tuple[str, ...], na=None): if isinstance(pat, str): result = pc.ends_with(self._pa_array, pattern=pat) else: From 6f1d2694bc20db5eabd452e5c5f5052942342003 Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Thu, 21 Dec 2023 01:09:05 -0500 Subject: [PATCH 6/6] Update pandas/core/arrays/arrow/array.py Co-authored-by: Xiao Yuan --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 463b88c4e0024..de1fb0ec5d4d5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2174,7 +2174,7 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str, na=None): + def _str_startswith(self, pat: str | tuple[str, ...], na=None): if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) else: