Make lib.array_equivalent_object work on N-d object arrays and compare
nested arrays safely.

* pandas/_libs/lib.pyx: accept plain ``ndarray`` arguments and walk both
  inputs in lockstep with a NumPy multi-iterator instead of 1-d indexing.
  The caller is responsible for checking ``left.shape == right.shape``.
  Each element pointer returned by ``PyArray_MultiIter_DATA`` is a
  ``void*``; it is cast to ``PyObject**`` and dereferenced to obtain the
  stored Python object (this is why ``PyObject`` is now cimported).
  Nested ndarray elements are unequal when their shapes differ; when both
  are object dtype the function recurses, otherwise it defers to
  ``pandas.core.dtypes.missing.array_equivalent``.
* pandas/_libs/lib.pyi: update the stub to ``npt.NDArray[np.object_]``.
* pandas/core/dtypes/missing.py: stop ravelling before the call, since the
  Cython routine no longer requires 1-d input.
* pandas/tests/dtypes/test_missing.py: add tests for nested arrays with
  mismatched shape (not equivalent) and mismatched dtype (equivalent).

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 188494c7c60db..3cbc04fb2f5cd 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -230,8 +230,8 @@ def generate_bins_dt64(
     hasnans: bool = ...,
 ) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=1]
 def array_equivalent_object(
-    left: np.ndarray,  # object[:]
-    right: np.ndarray,  # object[:]
+    left: npt.NDArray[np.object_],
+    right: npt.NDArray[np.object_],
 ) -> bool: ...
 def has_infs(arr: np.ndarray) -> bool: ...  # const floating[:]
 def get_reverse_indexer(
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 3769bbf087fee..5ef42228d8029 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -15,6 +15,7 @@ from cpython.iterator cimport PyIter_Check
 from cpython.number cimport PyNumber_Check
 from cpython.object cimport (
     Py_EQ,
+    PyObject,
     PyObject_RichCompareBool,
     PyTypeObject,
 )
@@ -571,25 +572,42 @@ def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def array_equivalent_object(left: object[:], right: object[:]) -> bool:
+def array_equivalent_object(ndarray left, ndarray right) -> bool:
     """
-    Perform an element by element comparison on 1-d object arrays
+    Perform an element by element comparison on N-d object arrays
     taking into account nan positions.
     """
+    # left and right both have object dtype, but we cannot annotate that
+    # without limiting ndim.
     cdef:
-        Py_ssize_t i, n = left.shape[0]
+        Py_ssize_t i, n = left.size
         object x, y
+        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(left, right)
+
+    # Caller is responsible for checking left.shape == right.shape
 
     for i in range(n):
-        x = left[i]
-        y = right[i]
+        # Analogous to: x = left[i]
+        x = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 0))[0]
+        y = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
 
         # we are either not equal or both nan
         # I think None == None will be true here
         try:
             if PyArray_Check(x) and PyArray_Check(y):
-                if not array_equivalent_object(x, y):
+                if x.shape != y.shape:
                     return False
+                if x.dtype == y.dtype == object:
+                    if not array_equivalent_object(x, y):
+                        return False
+                else:
+                    # Circular import isn't great, but so it goes.
+                    # TODO: could use np.array_equal?
+                    from pandas.core.dtypes.missing import array_equivalent
+
+                    if not array_equivalent(x, y):
+                        return False
+
             elif (x is C_NA) ^ (y is C_NA):
                 return False
             elif not (
@@ -612,6 +630,8 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool:
                     return False
                 raise
 
+        cnp.PyArray_MultiIter_NEXT(mi)
+
     return True
 
 
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 375d05bdf11ff..a225d2cd12eac 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -565,16 +565,7 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo
     if not strict_nan:
         # isna considers NaN and None to be equivalent.
 
-        if left.flags["F_CONTIGUOUS"] and right.flags["F_CONTIGUOUS"]:
-            # we can improve performance by doing a copy-free ravel
-            # e.g. in frame_methods.Equals.time_frame_nonunique_equal
-            # if we transposed the frames
-            left = left.ravel("K")
-            right = right.ravel("K")
-
-        return lib.array_equivalent_object(
-            ensure_object(left.ravel()), ensure_object(right.ravel())
-        )
+        return lib.array_equivalent_object(ensure_object(left), ensure_object(right))
 
     for left_value, right_value in zip(left, right):
         if left_value is NaT and right_value is not NaT:
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index cc365bef2b183..21c49807b7743 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -466,6 +466,27 @@ def test_array_equivalent_series(val):
         assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
 
 
+def test_array_equivalent_array_mismatched_shape():
+    # to trigger the motivating bug, the first N elements of the arrays need
+    # to match
+    first = np.array([1, 2, 3])
+    second = np.array([1, 2])
+
+    left = Series([first, "a"], dtype=object)
+    right = Series([second, "a"], dtype=object)
+    assert not array_equivalent(left, right)
+
+
+def test_array_equivalent_array_mismatched_dtype():
+    # same shape, different dtype can still be equivalent
+    first = np.array([1, 2], dtype=np.float64)
+    second = np.array([1, 2])
+
+    left = Series([first, "a"], dtype=object)
+    right = Series([second, "a"], dtype=object)
+    assert array_equivalent(left, right)
+
+
 def test_array_equivalent_different_dtype_but_equal():
     # Unclear if this is exposed anywhere in the public-facing API
     assert array_equivalent(np.array([1, 2]), np.array([1.0, 2.0]))