diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1d55fc3ed7b84..addaf81e1a3e5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -651,6 +651,8 @@ Conversion - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) +- Bug in logical operations (``&``, ``|``, ``^``) between boolean arrays and numpy-backed string arrays (``string[python]``), where ``logical_op`` did not cast the string array to boolean (:issue:`60234`) +- Bug in logical operations (``&``, ``|``, ``^``) between boolean arrays and pyarrow-backed string arrays (``string[pyarrow]``), where ``ArrowExtensionArray._evaluate_op_method`` did not cast the string array to boolean (:issue:`60234`) Strings ^^^^^^^ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0c93db0afb07..62a8e449b2b05 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -90,12 +90,12 @@ } ARROW_LOGICAL_FUNCS = { - "and_": pc.and_kleene, - "rand_": lambda x, y: pc.and_kleene(y, x), - "or_": pc.or_kleene, - "ror_": lambda x, y: pc.or_kleene(y, x), - "xor": pc.xor, - "rxor": lambda x, y: pc.xor(y, x), + "and_": lambda x, y: pc.and_kleene(*cast_for_logical(x, y)), + "rand_": lambda x, y: pc.and_kleene(*cast_for_logical(y, x)), + "or_": lambda x, y: pc.or_kleene(*cast_for_logical(x, y)), + "ror_": lambda x, y: pc.or_kleene(*cast_for_logical(y, x)), + "xor": lambda x, y: pc.xor(*cast_for_logical(x, y)), + "rxor": lambda x, y: pc.xor(*cast_for_logical(y, x)), } ARROW_BIT_WISE_FUNCS = { @@ -107,6 +107,20 @@ "rxor": lambda x, y: pc.bit_wise_xor(y, x), } + def convert_string_to_boolean_array(arr): + if pa.types.is_string(arr.type) or pa.types.is_large_string(arr.type): + string_to_bool = [bool(value.as_py()) for value 
in arr] + arr = pc.cast(string_to_bool, pa.bool_()) + return arr + + def cast_for_logical(x, y): + is_x_bool = pa.types.is_boolean(x.type) + is_y_bool = pa.types.is_boolean(y.type) + + if (is_x_bool != is_y_bool): + return convert_string_to_boolean_array(x), convert_string_to_boolean_array(y) + return x, y + def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: @@ -822,6 +836,13 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: result = pc_func(self._pa_array, other) except pa.ArrowNotImplementedError as err: raise TypeError(self._op_method_error_message(other_original, op)) from err + + if (op.__name__ in ARROW_LOGICAL_FUNCS + and (isinstance(self, pa.lib.BooleanArray) != + isinstance(other, pa.lib.BooleanArray)) + ): + return pc.cast(result, pa.bool_()) + return type(self)(result) def _logical_method(self, other, op) -> Self: diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..fd900726ecc47 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -435,6 +435,13 @@ def fill_bool(x, left=None): rvalues = right if should_extension_dispatch(lvalues, rvalues): + # Must cast if logical op between a boolean array and numpy-backed string array + if ((lvalues.dtype == np.bool_ and rvalues.dtype == "string[python]") + or (lvalues.dtype == "string[python]" and rvalues.dtype == np.bool_) + ): + lvalues = lvalues.astype(bool) + rvalues = rvalues.astype(bool) + # Call the method on lvalues res_values = op(lvalues, rvalues) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a32ac7db4656a..63b221d1fea24 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -740,3 +740,54 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + +@pytest.mark.parametrize("dtype", 
["string[pyarrow]"]) +def test_or_pyarrow_string(dtype): + with pd.option_context("future.infer_string", True): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 | ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_and_pyarrow_string(dtype): + with pd.option_context("future.infer_string", True): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 & ser2 + expected = pd.Series([False, False], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_xor_pyarrow_string(dtype): + with pd.option_context("future.infer_string", True): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 ^ ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]"]) +def test_or_numpy_string(dtype): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 | ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]"]) +def test_and_numpy_string(dtype): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 & ser2 + expected = pd.Series([False, False], dtype=bool) + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("dtype", ["string[python]"]) +def test_xor_numpy_string(dtype): + ser1 = pd.Series([False, False]) + ser2 = pd.Series(["", "b"], dtype=dtype) + result = ser1 ^ ser2 + expected = pd.Series([False, True], dtype=bool) + tm.assert_series_equal(result, expected) \ No newline at end of file