From ebcf567f01dd5a40f5e6d4de2d6885b904759aa4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 22 Nov 2023 23:42:10 +0100 Subject: [PATCH 1/3] BUG: ne comparison returns False for NA and other value --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/arrays/string_arrow.py | 6 +++++- pandas/tests/arithmetic/test_object.py | 29 ++++++++++++++++---------- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 543a9864ced26..bbf7f044245bf 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,7 +25,7 @@ Bug fixes - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- +- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 96ebb4901f797..976a8d3c32b23 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import operator import re from typing import ( TYPE_CHECKING, @@ -663,7 +664,10 @@ def _convert_int_dtype(self, result): def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - return result.to_numpy(np.bool_, na_value=False) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) def value_counts(self, dropna: bool = True) -> Series: from pandas import Series diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 5ffbf1a38e845..76e38500df5a2 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,10 +8,13 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core import ops @@ -31,20 +34,24 @@ def test_comparison_object_numeric_nas(self, comparison_op): expected = func(ser.astype(float), shifted.astype(float)) tm.assert_series_equal(result, expected) - def test_object_comparisons(self): - ser = Series(["a", "b", np.nan, "c", "a"]) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_object_comparisons(self, infer_string): + with option_context("future.infer_string", infer_string): + ser = Series(["a", "b", np.nan, "c", "a"]) - result = ser == "a" - expected = Series([True, False, False, False, True]) - tm.assert_series_equal(result, expected) + result = ser == "a" + expected = Series([True, False, False, False, True]) + tm.assert_series_equal(result, expected) - result = ser < "a" - expected = Series([False, False, False, False, False]) - tm.assert_series_equal(result, expected) + result = ser < "a" + expected = Series([False, False, False, False, False]) + tm.assert_series_equal(result, expected) - result = ser != "a" - expected = -(ser == "a") - tm.assert_series_equal(result, expected) + result = ser != "a" + expected = -(ser == "a") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) def test_more_na_comparisons(self, dtype): From 915a990d8e0cbf6e8cc16179791c8ac4773fd490 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 23 Nov 2023 00:18:58 +0100 Subject: [PATCH 2/3] Fix --- pandas/tests/arrays/string_/test_string.py | 26 +++++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 524a6632e5544..2877f7e55a19f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,6 +2,8 @@ This module tests the functionality of StringArray and ArrowStringArray. Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ +import operator + import numpy as np import pytest @@ -229,7 +231,10 @@ def test_comparison_methods_scalar(comparison_op, dtype): result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": expected = np.array([getattr(item, op_name)(other) for item in a]) - expected[1] = False + if comparison_op == operator.ne: + expected[1] = True + else: + expected[1] = False tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -244,7 +249,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): result = getattr(a, op_name)(pd.NA) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False]) + if operator.ne == comparison_op: + expected = np.array([True, True, True]) + else: + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -270,7 +278,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): if dtype.storage == "pyarrow_numpy": expected_data = { "__eq__": [False, False, False], - "__ne__": [True, False, True], + "__ne__": [True, True, True], }[op_name] expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) @@ -290,12 +298,18 @@ def test_comparison_methods_array(comparison_op, dtype): other = [None, None, "c"] result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False]) - expected[-1] = getattr(other[-1], op_name)(a[-1]) + if operator.ne == comparison_op: + expected = np.array([True, True, False]) + else: + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) result = getattr(a, op_name)(pd.NA) - expected = np.array([False, False, False]) + if operator.ne == comparison_op: + expected = np.array([True, True, True]) + else: + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: From 76ef3c2249303526729f5f6d1b9f1390e2779dc2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 17:35:59 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/arithmetic/test_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 4ec92ac1c8bb8..4ffd76722286a 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,9 +8,9 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype import pandas.util._test_decorators as td -from pandas._config import using_pyarrow_string_dtype import pandas as pd from pandas import (