From 7803361518dc7dd95304f6c38ff6ad08fb601fac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 2 Sep 2023 14:59:01 +0200 Subject: [PATCH] ENH: Implement more string accessors through PyArrow --- pandas/core/arrays/string_arrow.py | 31 ++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f438f75707265..0fd0979103c44 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -47,6 +47,8 @@ if TYPE_CHECKING: + from collections.abc import Sequence + from pandas._typing import ( Dtype, Scalar, @@ -334,19 +336,13 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_replace( self, @@ -365,6 +361,12 @@ def _str_replace( result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + return super()._str_repeat(repeats) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): @@ -379,6 +381,19 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if stop is None: + return super()._str_slice(start, stop, step) + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) return self._result_converter(result)