From 8dfb6cb0471ec567f15ddc1ef5efa0ab6d1e8a75 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Aug 2024 13:54:42 -0700 Subject: [PATCH 1/5] REF (string): de-duplicate str_endswith, startswith --- pandas/core/arrays/_arrow_string_mixins.py | 65 +++++++++++++++++++++- pandas/core/arrays/arrow/array.py | 33 +---------- pandas/core/arrays/string_arrow.py | 39 +------------ 3 files changed, 65 insertions(+), 72 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 341ac2c0b48ec..7add32aae47d7 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -9,16 +9,27 @@ from pandas.compat import pa_version_under10p1 +from pandas.core.dtypes.missing import isna + if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc if TYPE_CHECKING: - from pandas._typing import Self + from collections.abc import Sized + + from pandas._typing import ( + Scalar, + Self, + ) class ArrowStringArrayMixin: - _pa_array = None + # _object_compat specifies whether we should 1) attempt to match behaviors + # of the object-backed StringDtype and 2) fall back to object-based + # computation for cases that pyarrow does not support natively. + _object_compat = False + _pa_array: Sized def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -97,3 +108,53 @@ def _str_removesuffix(self, suffix: str): removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return type(self)(result) + + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + if self._object_compat: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=np.bool_), + mask=isna(self._pa_array), + ) + else: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return self._convert_bool_result(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + if self._object_compat: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=np.bool_), + mask=isna(self._pa_array), + ) + else: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fbffb4a0a9990..fa778aab71349 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2337,38 +2337,7 @@ def _str_contains( result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) + def _result_converter(self, result): return type(self)(result) def _str_replace( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 15807c365ecfd..061424859a0ed 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -278,6 +278,7 @@ def astype(self, dtype, copy: bool = True): # ------------------------------------------------------------------------ # String methods interface + _object_compat = True _str_map = BaseStringArray._str_map @@ -298,44 +299,6 @@ def _str_contains( result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.starts_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): - if isinstance(pat, str): - result = pc.ends_with(self._pa_array, pattern=pat) - else: - if len(pat) == 0: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) - ) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) - - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): - result = result.fill_null(na) - return self._convert_bool_result(result) - def _str_replace( self, pat: str | re.Pattern, From 78bc2244288d78a638cb4d4fb0cc89021dee540b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 22 Aug 2024 09:24:46 -0700 Subject: [PATCH 2/5] specify override --- pandas/core/arrays/string_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 061424859a0ed..0084f7ac548fb 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -281,6 +281,8 @@ def astype(self, dtype, copy: bool = True): _object_compat = True _str_map = BaseStringArray._str_map + _str_startswith = ArrowStringArrayMixin._str_startswith + _str_endswith = ArrowStringArrayMixin._str_endswith def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True From dcb77b60595663faad3532c9b9aea88485d642a6 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 07:38:20 -0700 Subject: [PATCH 3/5] No need for _object_compat --- pandas/core/arrays/_arrow_string_mixins.py | 32 ++++------------------ 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 7add32aae47d7..77cfb8bdee6c5 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -25,10 +25,6 @@ class ArrowStringArrayMixin: - # _object_compat specifies whether we should 1) attempt to match behaviors - # of the object-backed StringDtype and 2) fall back to object-based - # computation for cases that pyarrow does not support natively. - _object_compat = False _pa_array: Sized def __init__(self, *args, **kwargs) -> None: @@ -114,17 +110,9 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): result = pc.starts_with(self._pa_array, pattern=pat) else: if len(pat) == 0: - if self._object_compat: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=np.bool_), - mask=isna(self._pa_array), - ) - else: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) else: result = pc.starts_with(self._pa_array, pattern=pat[0]) @@ -139,17 +127,9 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): result = pc.ends_with(self._pa_array, pattern=pat) else: if len(pat) == 0: - if self._object_compat: - # mimic existing behaviour of string extension array - # and python string method - result = pa.array( - np.zeros(len(self._pa_array), dtype=np.bool_), - mask=isna(self._pa_array), - ) - else: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) + # For empty tuple we return null for missing values and False + # for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) else: result = pc.ends_with(self._pa_array, pattern=pat[0]) From 16ac7fd45f9e3ed50554afb30a431b83a02e7b55 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 08:32:16 -0700 Subject: [PATCH 4/5] pyright ignore --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 77cfb8bdee6c5..c810af32f7480 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -118,7 +118,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) @@ -135,6 +135,6 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) From cdaa99baff7036fdd9527a751cbcbf33dac49a15 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 10:07:43 -0700 Subject: [PATCH 5/5] CLN: remove no-longer-needed object_compat --- pandas/core/arrays/string_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0084f7ac548fb..cfc892b9e3648 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -278,7 +278,6 @@ def astype(self, dtype, copy: bool = True): # ------------------------------------------------------------------------ # String methods interface - _object_compat = True _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith