From 40bec0df90b5ab63799515e65fa353e0824f54f0 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Thu, 29 Apr 2021 20:33:17 +0100
Subject: [PATCH] [ArrowStringArray] startswith/endswith using native pyarrow method

---
 pandas/core/arrays/string_arrow.py        | 21 +++++++++++++
 pandas/tests/strings/test_find_replace.py | 38 +++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 180ed51e7fd2b..4118a172c9dac 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
+import re
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -763,6 +764,26 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         else:
             return super()._str_contains(pat, case, flags, na, regex)
 
+    def _str_startswith(self, pat, na=None):
+        if hasattr(pc, "match_substring_regex"):
+            result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
+            result = BooleanDtype().__from_arrow__(result)
+            if not isna(na):
+                result[isna(result)] = bool(na)
+            return result
+        else:
+            return super()._str_startswith(pat, na)
+
+    def _str_endswith(self, pat, na=None):
+        if hasattr(pc, "match_substring_regex"):
+            result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
+            result = BooleanDtype().__from_arrow__(result)
+            if not isna(na):
+                result[isna(result)] = bool(na)
+            return result
+        else:
+            return super()._str_endswith(pat, na)
+
     def _str_isalnum(self):
         if hasattr(pc, "utf8_is_alnum"):
             result = pc.utf8_is_alnum(self._data)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index d801d3457027f..0c54042d983ad 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -200,6 +200,25 @@ def test_startswith(dtype, null_value, na):
     tm.assert_series_equal(rs, xp)
 
 
+@pytest.mark.parametrize("na", [None, True, False])
+def test_startswith_nullable_string_dtype(nullable_string_dtype, na):
+    values = Series(
+        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
+        dtype=nullable_string_dtype,
+    )
+    result = values.str.startswith("foo", na=na)
+    exp = Series(
+        [False, na, True, False, False, na, True, False, False], dtype="boolean"
+    )
+    tm.assert_series_equal(result, exp)
+
+    result = values.str.startswith("rege.", na=na)
+    exp = Series(
+        [False, na, False, False, False, na, False, False, True], dtype="boolean"
+    )
+    tm.assert_series_equal(result, exp)
+
+
 @pytest.mark.parametrize("dtype", [None, "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
@@ -228,6 +247,25 @@ def test_endswith(dtype, null_value, na):
     tm.assert_series_equal(rs, xp)
 
 
+@pytest.mark.parametrize("na", [None, True, False])
+def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
+    values = Series(
+        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
+        dtype=nullable_string_dtype,
+    )
+    result = values.str.endswith("foo", na=na)
+    exp = Series(
+        [False, na, False, False, True, na, True, False, False], dtype="boolean"
+    )
+    tm.assert_series_equal(result, exp)
+
+    result = values.str.endswith("rege.", na=na)
+    exp = Series(
+        [False, na, False, False, False, na, False, False, True], dtype="boolean"
+    )
+    tm.assert_series_equal(result, exp)
+
+
 def test_replace():
     values = Series(["fooBAD__barBAD", np.nan])
 