diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5237f4e79079d..e6245ef3d2876 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -861,6 +861,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) +- Performance improvement in :func:`isna` and :func:`isnull` (:issue:`50658`) - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 77876d0c55337..81fd5792ee9e5 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1366,7 +1366,7 @@ def rank_2d( nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, 0) if numeric_object_t is object: - mask = missing.isnaobj2d(values).view(np.uint8) + mask = missing.isnaobj(values).view(np.uint8) elif numeric_object_t is float64_t or numeric_object_t is float32_t: mask = np.isnan(values).view(np.uint8) else: diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 27f227558dee5..d5c9f1342a089 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -13,6 +13,5 @@ def isposinf_scalar(val: object) -> bool: ... def isneginf_scalar(val: object) -> bool: ... def checknull(val: object, inf_as_na: bool = ...) -> bool: ... def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... -def isnaobj2d(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index a3b0451381ad2..fc94d221a63b9 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -4,10 +4,12 @@ from sys import maxsize cimport cython from cython cimport Py_ssize_t + import numpy as np cimport numpy as cnp from numpy cimport ( + flatiter, float64_t, int64_t, ndarray, @@ -197,56 +199,22 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False): result : ndarray (dtype=np.bool_) """ cdef: - Py_ssize_t i, n + Py_ssize_t i, n = arr.size object val - ndarray[uint8_t] result - - assert arr.ndim == 1, "'arr' must be 1-D." - - n = len(arr) - result = np.empty(n, dtype=np.uint8) - for i in range(n): - val = arr[i] - result[i] = checknull(val, inf_as_na=inf_as_na) - return result.view(np.bool_) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def isnaobj2d(arr: ndarray, inf_as_na: bool = False) -> ndarray: - """ - Return boolean mask denoting which elements of a 2-D array are na-like, - according to the criteria defined in `checknull`: - - None - - nan - - NaT - - np.datetime64 representation of NaT - - np.timedelta64 representation of NaT - - NA - - Decimal("NaN") - - Parameters - ---------- - arr : ndarray - - Returns - ------- - result : ndarray (dtype=np.bool_) - """ - cdef: - Py_ssize_t i, j, n, m - object val - ndarray[uint8_t, ndim=2] result - - assert arr.ndim == 2, "'arr' must be 2-D." + bint is_null + ndarray result = np.empty((arr).shape, dtype=np.uint8) + flatiter it = cnp.PyArray_IterNew(arr) + flatiter it2 = cnp.PyArray_IterNew(result) - n, m = (arr).shape - result = np.zeros((n, m), dtype=np.uint8) for i in range(n): - for j in range(m): - val = arr[i, j] - if checknull(val, inf_as_na=inf_as_na): - result[i, j] = 1 + # The PyArray_GETITEM and PyArray_ITER_NEXT are faster + # equivalents to `val = values[i]` + val = cnp.PyArray_GETITEM(arr, cnp.PyArray_ITER_DATA(it)) + cnp.PyArray_ITER_NEXT(it) + is_null = checknull(val, inf_as_na=inf_as_na) + # Dereference pointer (set value) + ((cnp.PyArray_ITER_DATA(it2)))[0] = is_null + cnp.PyArray_ITER_NEXT(it2) return result.view(np.bool_) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 000b5ebbdd2f7..493842fc71bd9 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -313,10 +313,8 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo result = np.zeros(values.shape, dtype=bool) else: - if values.ndim == 1: + if values.ndim in {1, 2}: result = libmissing.isnaobj(values, inf_as_na=inf_as_na) - elif values.ndim == 2: - result = libmissing.isnaobj2d(values, inf_as_na=inf_as_na) else: # 0-D, reached via e.g. mask_missing result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 94707dc2e68c5..4f0c7d9135629 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -624,9 +624,16 @@ def _check_behavior(self, arr, expected): arr = np.atleast_2d(arr) expected = np.atleast_2d(expected) - result = libmissing.isnaobj2d(arr) + result = libmissing.isnaobj(arr) + tm.assert_numpy_array_equal(result, expected) + result = libmissing.isnaobj(arr, inf_as_na=True) tm.assert_numpy_array_equal(result, expected) - result = libmissing.isnaobj2d(arr, inf_as_na=True) + + # Test fortran order + arr = arr.copy(order="F") + result = libmissing.isnaobj(arr) + tm.assert_numpy_array_equal(result, expected) + result = libmissing.isnaobj(arr, inf_as_na=True) tm.assert_numpy_array_equal(result, expected) def test_basic(self):