From 69bfe5f9ecb16d2f38b3f98c3097fe5e02722caa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 19 Jan 2023 00:17:11 +0100 Subject: [PATCH 1/2] PERF: Improve performance for array equal fast --- pandas/_libs/lib.pyi | 4 ++-- pandas/_libs/lib.pyx | 10 ++++------ pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 7 ++----- pandas/core/series.py | 8 +++----- pandas/tests/libs/test_lib.py | 19 ++++++++----------- 6 files changed, 21 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 2439082bf7413..7df632313bf55 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -240,6 +240,6 @@ def get_reverse_indexer( ) -> npt.NDArray[np.intp]: ... def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... -def array_equal_fast( - left: np.ndarray, right: np.ndarray # np.ndarray[np.int64, ndim=1] +def indexer_equal_fast( + left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1] ) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 176307ef27cff..216105b32a069 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -650,22 +650,20 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def array_equal_fast( - ndarray[int6432_t, ndim=1] left, ndarray[int6432_t, ndim=1] right, -) -> bool: +def indexer_equal_fast(ndarray[int6432_t, ndim=1] left, int n) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons """ cdef: - Py_ssize_t i, n = left.size + Py_ssize_t i - if left.size != right.size: + if left.size != n: return False for i in range(n): - if left[i] != right[i]: + if left[i] != i: return False return True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eb0eb34dbefc4..36195ea04815f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -48,7 +48,7 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, - array_equal_fast, + indexer_equal_fast, no_default, ) from pandas._typing import ( @@ -6724,7 +6724,7 @@ def sort_values( else: return self.copy(deep=None) - if array_equal_fast(indexer, np.arange(0, len(indexer), dtype=indexer.dtype)): + if indexer_equal_fast(indexer, len(indexer)): if inplace: return self._update_inplace(self) else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ab9b76fbdf712..af28b8083d4ba 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -35,7 +35,7 @@ ) from pandas._libs import lib -from pandas._libs.lib import array_equal_fast +from pandas._libs.lib import indexer_equal_fast from pandas._libs.tslibs import ( Period, Tick, @@ -3780,10 +3780,7 @@ def _take( axis == 0 and indices.ndim == 1 and using_copy_on_write() - and array_equal_fast( - indices, - np.arange(0, len(self), dtype=np.intp), - ) + and indexer_equal_fast(indices, len(self)) ): return self.copy(deep=None) diff --git a/pandas/core/series.py b/pandas/core/series.py index 91f7095e59db5..eb64e84ffd1d2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -33,7 +33,7 @@ reshape, ) from pandas._libs.lib import ( - array_equal_fast, + indexer_equal_fast, no_default, ) from pandas._typing import ( @@ -891,7 +891,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: if ( indices.ndim == 1 and using_copy_on_write() - and array_equal_fast(indices, np.arange(0, len(self), dtype=indices.dtype)) + and indexer_equal_fast(indices, len(self)) ): return self.copy(deep=None) @@ -3566,9 +3566,7 @@ def sort_values( values_to_sort = ensure_key_mapped(self, key)._values if key else self._values sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position) - if array_equal_fast( - sorted_index, np.arange(0, len(sorted_index), dtype=sorted_index.dtype) - ): + if indexer_equal_fast(sorted_index, len(sorted_index)): if inplace: return self._update_inplace(self) return self.copy(deep=None) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index e352250dc748d..0eef6a51ef1e1 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -244,25 +244,22 @@ def test_get_reverse_indexer(self): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype", ["int64", "int32"]) - def test_array_equal_fast(self, dtype): + def test_indexer_equal_fast(self, dtype): # GH#50592 - left = np.arange(1, 100, dtype=dtype) - right = np.arange(1, 100, dtype=dtype) - assert lib.array_equal_fast(left, right) + left = np.arange(0, 100, dtype=dtype) + assert lib.indexer_equal_fast(left, 100) @pytest.mark.parametrize("dtype", ["int64", "int32"]) - def test_array_equal_fast_not_equal(self, dtype): + def test_indexer_equal_fast_not_equal(self, dtype): # GH#50592 left = np.array([1, 2], dtype=dtype) - right = np.array([2, 2], dtype=dtype) - assert not lib.array_equal_fast(left, right) + assert not lib.indexer_equal_fast(left, 2) @pytest.mark.parametrize("dtype", ["int64", "int32"]) - def test_array_equal_fast_not_equal_shape(self, dtype): + def test_indexer_equal_fast_not_equal_shape(self, dtype): # GH#50592 - left = np.array([1, 2, 3], dtype=dtype) - right = np.array([2, 2], dtype=dtype) - assert not lib.array_equal_fast(left, right) + left = np.array([0, 1, 2], dtype=dtype) + assert not lib.indexer_equal_fast(left, 2) def test_cache_readonly_preserve_docstrings(): From 48e163592f51839f1fbb679c8d3683d8fb9b3476 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 19 Jan 2023 21:23:50 +0100 Subject: [PATCH 2/2] Rename --- pandas/_libs/lib.pyi | 2 +- pandas/_libs/lib.pyx | 2 +- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 4 ++-- pandas/core/series.py | 6 +++--- pandas/tests/libs/test_lib.py | 12 ++++++------ 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 7df632313bf55..72b46d9e30684 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -240,6 +240,6 @@ def get_reverse_indexer( ) -> npt.NDArray[np.intp]: ... def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... -def indexer_equal_fast( +def is_range_indexer( left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1] ) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 216105b32a069..16d5bbaad9de9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -650,7 +650,7 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def indexer_equal_fast(ndarray[int6432_t, ndim=1] left, int n) -> bool: +def is_range_indexer(ndarray[int6432_t, ndim=1] left, int n) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36195ea04815f..3b122eaa814e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -48,7 +48,7 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, - indexer_equal_fast, + is_range_indexer, no_default, ) from pandas._typing import ( @@ -6724,7 +6724,7 @@ def sort_values( else: return self.copy(deep=None) - if indexer_equal_fast(indexer, len(indexer)): + if is_range_indexer(indexer, len(indexer)): if inplace: return self._update_inplace(self) else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index af28b8083d4ba..a91c46d7d06c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -35,7 +35,7 @@ ) from pandas._libs import lib -from pandas._libs.lib import indexer_equal_fast +from pandas._libs.lib import is_range_indexer from pandas._libs.tslibs import ( Period, Tick, @@ -3780,7 +3780,7 @@ def _take( axis == 0 and indices.ndim == 1 and using_copy_on_write() - and indexer_equal_fast(indices, len(self)) + and is_range_indexer(indices, len(self)) ): return self.copy(deep=None) diff --git a/pandas/core/series.py b/pandas/core/series.py index eb64e84ffd1d2..2849b009cf72c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -33,7 +33,7 @@ reshape, ) from pandas._libs.lib import ( - indexer_equal_fast, + is_range_indexer, no_default, ) from pandas._typing import ( @@ -891,7 +891,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Series: if ( indices.ndim == 1 and using_copy_on_write() - and indexer_equal_fast(indices, len(self)) + and is_range_indexer(indices, len(self)) ): return self.copy(deep=None) @@ -3566,7 +3566,7 @@ def sort_values( values_to_sort = ensure_key_mapped(self, key)._values if key else self._values sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position) - if indexer_equal_fast(sorted_index, len(sorted_index)): + if is_range_indexer(sorted_index, len(sorted_index)): if inplace: return self._update_inplace(self) return self.copy(deep=None) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 0eef6a51ef1e1..302dc21ec997c 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -244,22 +244,22 @@ def test_get_reverse_indexer(self): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype", ["int64", "int32"]) - def test_indexer_equal_fast(self, dtype): + def test_is_range_indexer(self, dtype): # GH#50592 left = np.arange(0, 100, dtype=dtype) - assert lib.indexer_equal_fast(left, 100) + assert lib.is_range_indexer(left, 100) @pytest.mark.parametrize("dtype", ["int64", "int32"]) - def test_indexer_equal_fast_not_equal(self, dtype): + def test_is_range_indexer_not_equal(self, dtype): # GH#50592 left = np.array([1, 2], dtype=dtype) - assert not lib.indexer_equal_fast(left, 2) + assert not lib.is_range_indexer(left, 2) @pytest.mark.parametrize("dtype", ["int64", "int32"]) - def test_indexer_equal_fast_not_equal_shape(self, dtype): + def test_is_range_indexer_not_equal_shape(self, dtype): # GH#50592 left = np.array([0, 1, 2], dtype=dtype) - assert not lib.indexer_equal_fast(left, 2) + assert not lib.is_range_indexer(left, 2) def test_cache_readonly_preserve_docstrings():