-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API (string dtype): implement hierarchy (NA > NaN, pyarrow > python) for consistent comparisons between different string dtypes #61138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
3c4d782
7ffb08f
48907c3
2058120
4ebd93b
33db5d0
51340a9
e2bfe18
5ba3577
846afff
99475e6
b481d7a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
|
||
from pandas._config import using_string_dtype | ||
|
||
from pandas.compat import HAS_PYARROW | ||
from pandas.compat.pyarrow import ( | ||
pa_version_under12p0, | ||
pa_version_under19p0, | ||
|
@@ -45,6 +46,25 @@ def cls(dtype): | |
return dtype.construct_array_type() | ||
|
||
|
||
def string_dtype_highest_priority(dtype1, dtype2): | ||
if HAS_PYARROW: | ||
DTYPE_HIERARCHY = [ | ||
pd.StringDtype("python", na_value=np.nan), | ||
pd.StringDtype("pyarrow", na_value=np.nan), | ||
pd.StringDtype("python", na_value=pd.NA), | ||
pd.StringDtype("pyarrow", na_value=pd.NA), | ||
] | ||
else: | ||
DTYPE_HIERARCHY = [ | ||
pd.StringDtype("python", na_value=np.nan), | ||
pd.StringDtype("python", na_value=pd.NA), | ||
] | ||
|
||
h1 = DTYPE_HIERARCHY.index(dtype1) | ||
h2 = DTYPE_HIERARCHY.index(dtype2) | ||
return DTYPE_HIERARCHY[max(h1, h2)] | ||
|
||
|
||
def test_dtype_constructor(): | ||
pytest.importorskip("pyarrow") | ||
|
||
|
@@ -319,37 +339,41 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): | |
tm.assert_extension_array_equal(result, expected) | ||
|
||
|
||
def test_comparison_methods_array(comparison_op, dtype): | ||
def test_comparison_methods_array(comparison_op, dtype, dtype2): | ||
op_name = f"__{comparison_op.__name__}__" | ||
|
||
a = pd.array(["a", None, "c"], dtype=dtype) | ||
other = [None, None, "c"] | ||
result = getattr(a, op_name)(other) | ||
if dtype.na_value is np.nan: | ||
other = pd.array([None, None, "c"], dtype=dtype2) | ||
result = comparison_op(a, other) | ||
|
||
# ensure operation is commutative | ||
result2 = comparison_op(other, a) | ||
tm.assert_equal(result, result2) | ||
|
||
if dtype.na_value is np.nan and dtype2.na_value is np.nan: | ||
if operator.ne == comparison_op: | ||
expected = np.array([True, True, False]) | ||
else: | ||
expected = np.array([False, False, False]) | ||
expected[-1] = getattr(other[-1], op_name)(a[-1]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = getattr(a, op_name)(pd.NA) | ||
if operator.ne == comparison_op: | ||
expected = np.array([True, True, True]) | ||
else: | ||
max_dtype = string_dtype_highest_priority(dtype, dtype2) | ||
if max_dtype.storage == "python": | ||
expected_dtype = "boolean" | ||
else: | ||
expected = np.array([False, False, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
expected_dtype = "bool[pyarrow]" | ||
|
||
else: | ||
expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" | ||
expected = np.full(len(a), fill_value=None, dtype="object") | ||
expected[-1] = getattr(other[-1], op_name)(a[-1]) | ||
expected = pd.array(expected, dtype=expected_dtype) | ||
tm.assert_extension_array_equal(result, expected) | ||
|
||
result = getattr(a, op_name)(pd.NA) | ||
expected = pd.array([None, None, None], dtype=expected_dtype) | ||
tm.assert_extension_array_equal(result, expected) | ||
# # with list | ||
# other = [None, None, "c"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you want to implement testing this in this PR? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this was already implemented, just need to add this case back to the test. The original "array" test was actually testing with a list. I updated the test to now actually use an array (parametrized with all the different dtypes, to get all combinations of dtypes in both operands), and added a separate test with just the list. |
||
# result3 = getattr(a, op_name)(other) | ||
# tm.assert_equal(result, result3) | ||
|
||
|
||
def test_constructor_raises(cls): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For this case of comparing with NA, we already have a dedicated test just above, so removing it here