From fca29525b434d1a170e6a399cbdba242452c7918 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 11 Feb 2023 15:21:45 -0800 Subject: [PATCH 1/3] ENH: support addition with pyarrow string dtypes --- pandas/core/arrays/arrow/array.py | 17 +++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 17 ++--------------- pandas/tests/extension/test_arrow.py | 8 +++++--- pandas/tests/strings/test_api.py | 11 +---------- 4 files changed, 25 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3a3f0b8ce61be..8021d13be51c2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1,6 +1,7 @@ from __future__ import annotations from copy import deepcopy +import operator from typing import ( TYPE_CHECKING, Any, @@ -12,6 +13,10 @@ import numpy as np from pandas._libs import lib +from pandas._libs.ops import ( + scalar_binop, + vec_binop, +) from pandas._typing import ( ArrayLike, AxisInt, @@ -47,6 +52,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core import roperator from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com @@ -454,6 +460,17 @@ def _cmp_method(self, other, op): return BooleanArray(values, mask) def _evaluate_op_method(self, other, op, arrow_funcs): + pa_type = self._data.type + if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ + operator.add, + roperator.radd, + ]: + if is_scalar(other): + result = scalar_binop(self.to_numpy(), other, op) + else: + result = vec_binop(self.to_numpy(), np.asarray(other), op) + return type(self)._from_sequence(result) + pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: raise NotImplementedError(f"{op.__name__} not implemented.") diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 7e17efe4e7380..6d4913cfcba4a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -96,15 +96,7 @@ def test_astype_roundtrip(dtype): tm.assert_series_equal(result, ser) -def test_add(dtype, request): - if dtype.storage == "pyarrow": - reason = ( - "unsupported operand type(s) for +: 'ArrowStringArray' and " - "'ArrowStringArray'" - ) - mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) - request.node.add_marker(mark) - +def test_add(dtype): a = pd.Series(["a", "b", "c", None, None], dtype=dtype) b = pd.Series(["x", "y", None, "z", None], dtype=dtype) @@ -140,12 +132,7 @@ def test_add_2d(dtype, request): s + b -def test_add_sequence(dtype, request): - if dtype.storage == "pyarrow": - reason = "unsupported operand type(s) for +: 'ArrowStringArray' and 'list'" - mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) - request.node.add_marker(mark) - +def test_add_sequence(dtype): a = pd.array(["a", "b", None, None], dtype=dtype) other = ["x", None, "y", None] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 681d048f38485..afbc7f6a0b3dc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1045,6 +1045,10 @@ def _get_scalar_exception(self, opname, pa_dtype): exc = NotImplementedError elif arrow_temporal_supported: exc = None + elif opname in ["__add__", "__radd__"] and ( + pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) + ): + exc = None elif not (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype)): exc = pa.ArrowNotImplementedError else: @@ -1219,9 +1223,7 @@ def test_add_series_with_extension_array(self, data, request): return if (pa_version_under8p0 and pa.types.is_duration(pa_dtype)) or ( - pa.types.is_binary(pa_dtype) - or pa.types.is_string(pa_dtype) - or pa.types.is_boolean(pa_dtype) + pa.types.is_boolean(pa_dtype) ): request.node.add_marker( pytest.mark.xfail( diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 7a6c7e69047bc..26d659646fe0a 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -6,7 +6,6 @@ MultiIndex, Series, _testing as tm, - get_option, ) from pandas.core import strings @@ -125,16 +124,8 @@ def test_api_per_method( method(*args, **kwargs) -def test_api_for_categorical(any_string_method, any_string_dtype, request): +def test_api_for_categorical(any_string_method, any_string_dtype): # https://github.com/pandas-dev/pandas/issues/10661 - - if any_string_dtype == "string[pyarrow]" or ( - any_string_dtype == "string" and get_option("string_storage") == "pyarrow" - ): - # unsupported operand type(s) for +: 'ArrowStringArray' and 'str' - mark = pytest.mark.xfail(raises=NotImplementedError, reason="Not Implemented") - request.node.add_marker(mark) - s = Series(list("aabb"), dtype=any_string_dtype) s = s + " " + s c = s.astype("category") From 09bdb562478784cc76d3ceb081999925bf34c4bb Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Feb 2023 16:05:45 -0800 Subject: [PATCH 2/3] use pc.binary_join_element_wise --- pandas/core/arrays/arrow/array.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index eb664b8609efd..df686288aa897 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -13,10 +13,6 @@ import numpy as np from pandas._libs import lib -from pandas._libs.ops import ( - scalar_binop, - vec_binop, -) from pandas._typing import ( ArrayLike, AxisInt, @@ -465,11 +461,21 @@ def _evaluate_op_method(self, other, op, arrow_funcs): operator.add, roperator.radd, ]: + length = self._data.length() + if pa.types.is_string(pa_type): + seps = [""] * length + else: + seps = [b""] * length + if is_scalar(other): - result = scalar_binop(self.to_numpy(), other, op) + other = [other] * length + elif isinstance(other, type(self)): + other = other._data + if op is operator.add: + result = pc.binary_join_element_wise(self._data, other, seps) else: - result = vec_binop(self.to_numpy(), np.asarray(other), op) - return type(self)._from_sequence(result) + result = pc.binary_join_element_wise(other, self._data, seps) + return type(self)(result) pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: From 9ee87cb632a349793a4ce4b04c96169dbf4d4a02 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Feb 2023 18:01:41 -0800 Subject: [PATCH 3/3] mypy fixup --- pandas/core/arrays/arrow/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8bf879573a771..efd44b1bca3f2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -467,6 +467,8 @@ def _evaluate_op_method(self, other, op, arrow_funcs): roperator.radd, ]: length = self._data.length() + + seps: list[str] | list[bytes] if pa.types.is_string(pa_type): seps = [""] * length else: