From 9b18b6bec51fa0c475a46e50d81f9bd53a6eba63 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Mar 2024 11:31:42 -0700 Subject: [PATCH 1/9] PERF: Avoid np.divmod in RangeIndex._shallow_copy --- pandas/_libs/lib.pyx | 6 +++--- pandas/core/indexes/range.py | 13 +++++-------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 00668576d5d53..4a692de802804 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -659,7 +659,7 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: +def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n, int diff=1) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons @@ -667,12 +667,12 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: cdef: Py_ssize_t i - if left.size != n: + if left.size != n or diff == 0: return False for i in range(n): - if left[i] != i: + if left[i] != left[0] + i * diff: return False return True diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 24f53f16e1985..13af2de28fe13 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -476,14 +476,11 @@ def _shallow_copy(self, values, name: Hashable = no_default): # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype diff = values[1] - values[0] - if not missing.isna(diff) and diff != 0: - maybe_range_indexer, remainder = np.divmod(values - values[0], diff) - if ( - lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) - and not remainder.any() - ): - new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) + if not missing.isna(diff) and lib.is_range_indexer( + values, len(values), diff + ): + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) def _view(self) -> Self: From 9bcc4eacb6ba7d1adeafbb331fab753a5eb0616f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:48:36 -0700 Subject: [PATCH 2/9] Make is_range --- pandas/_libs/lib.pyi | 4 ++++ pandas/_libs/lib.pyx | 26 +++++++++++++++++++++++--- pandas/core/indexes/range.py | 4 +--- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 34193a9b1d231..d7ba16d5e9d69 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -231,3 +231,7 @@ def is_range_indexer( left: np.ndarray, n: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... +def is_range( + left: np.ndarray, + diff: int, # np.ndarray[np.int64, ndim=1] +) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4a692de802804..8412d3b4fd0e7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -659,7 +659,7 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n, int diff=1) -> bool: +def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons @@ -667,12 +667,32 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n, int diff=1) cdef: Py_ssize_t i - if left.size != n or diff == 0: + if left.size != n: return False for i in range(n): - if left[i] != left[0] + i * diff: + if left[i] != i: + return False + + return True + + +@cython.wraparound(False) +@cython.boundscheck(False) +def is_range(ndarray[int6432_t, ndim=1] sequence, int64_t diff) -> bool: + """ + Check if sequence is equivalent to a range with the specified diff. + """ + cdef: + Py_ssize_t i, n = len(sequence) + + if diff == 0: + return False + + for i in range(n): + + if sequence[i] != sequence[0] + i * diff: return False return True diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 13af2de28fe13..a66cfabaee171 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -476,9 +476,7 @@ def _shallow_copy(self, values, name: Hashable = no_default): # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype diff = values[1] - values[0] - if not missing.isna(diff) and lib.is_range_indexer( - values, len(values), diff - ): + if not missing.isna(diff) and lib.is_range(values, diff): new_range = range(values[0], values[-1] + diff, diff) return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) From c9339d56c662f384e6be5fae43dec198ad8f263f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:50:17 -0700 Subject: [PATCH 3/9] pyi error --- pandas/_libs/lib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index d7ba16d5e9d69..3e14bf12c43f8 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -232,6 +232,6 @@ def is_range_indexer( n: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... def is_range( - left: np.ndarray, + sequence: np.ndarray, diff: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... From 0b484bdb270168422dd678ad08171257ac10d186 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:18:07 -0700 Subject: [PATCH 4/9] Use step --- pandas/_libs/lib.pyi | 4 ++-- pandas/_libs/lib.pyx | 8 ++++---- pandas/core/indexes/range.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 3e14bf12c43f8..b39d32d069619 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -231,7 +231,7 @@ def is_range_indexer( left: np.ndarray, n: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... -def is_range( +def is_sequence_range( sequence: np.ndarray, - diff: int, # np.ndarray[np.int64, ndim=1] + step: int, # np.ndarray[np.int64, ndim=1] ) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8412d3b4fd0e7..0780c4f0c454c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -680,19 +680,19 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def is_range(ndarray[int6432_t, ndim=1] sequence, int64_t diff) -> bool: +def is_sequence_range(ndarray[intp_t, ndim=1] sequence, int64_t step) -> bool: """ - Check if sequence is equivalent to a range with the specified diff. + Check if sequence is equivalent to a range with the specified step. """ cdef: Py_ssize_t i, n = len(sequence) - if diff == 0: + if step == 0: return False for i in range(n): - if sequence[i] != sequence[0] + i * diff: + if sequence[i] != sequence[0] + i * step: return False return True diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 64c855347dc7a..2e7c3a1ad6e01 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -482,7 +482,7 @@ def _shallow_copy(self, values, name: Hashable = no_default): new_range = range(start, start + self.step, self.step) return type(self)._simple_new(new_range, name=name) diff = values[1] - values[0] - if not missing.isna(diff) and lib.is_range(values, diff): + if not missing.isna(diff) and lib.is_sequence_range(values, diff): new_range = range(values[0], values[-1] + diff, diff) return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) From 997416f848e13318e63cdfc787d209982be40746 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Mar 2024 12:38:11 -0700 Subject: [PATCH 5/9] Switch back to int6432 --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0780c4f0c454c..5196148cfd468 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -680,7 +680,7 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def is_sequence_range(ndarray[intp_t, ndim=1] sequence, int64_t step) -> bool: +def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool: """ Check if sequence is equivalent to a range with the specified step. """ From b8ea98ca75b06fb072d55b4a25d619f9c03a837e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 09:26:04 -0700 Subject: [PATCH 6/9] try int64_t --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5196148cfd468..bc2a2e5f99e53 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -680,7 +680,7 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool: +def is_sequence_range(ndarray[int64_t, ndim=1] sequence, int64_t step) -> bool: """ Check if sequence is equivalent to a range with the specified step. """ From 21f13d9d9a80e7b63489b061ee7880783b0f465b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:34:12 -0700 Subject: [PATCH 7/9] Revert "try int64_t" This reverts commit b8ea98ca75b06fb072d55b4a25d619f9c03a837e. --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc2a2e5f99e53..5196148cfd468 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -680,7 +680,7 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def is_sequence_range(ndarray[int64_t, ndim=1] sequence, int64_t step) -> bool: +def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool: """ Check if sequence is equivalent to a range with the specified step. """ From 503d96f061dd44cf54f029048ea96b81cbda88cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 19 Mar 2024 15:58:26 -0700 Subject: [PATCH 8/9] Adjust maybe_sequence_to_range --- pandas/core/indexes/base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 111263f588439..8185e5abde321 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7179,13 +7179,7 @@ def maybe_sequence_to_range(sequence) -> Any | range: diff = np_sequence[1] - np_sequence[0] if diff == 0: return sequence - elif len(np_sequence) == 2: - return range(np_sequence[0], np_sequence[1] + diff, diff) - maybe_range_indexer, remainder = np.divmod(np_sequence - np_sequence[0], diff) - if ( - lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) - and not remainder.any() - ): + elif len(np_sequence) == 2 or lib.is_sequence_range(np_sequence, diff): return range(np_sequence[0], np_sequence[-1] + diff, diff) else: return sequence From 5f31dd85273f1a8329d00746b4332a5c61a984b4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Mar 2024 09:14:34 -0700 Subject: [PATCH 9/9] Access first element once --- pandas/_libs/lib.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5196148cfd468..a2205454a5a46 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -686,15 +686,17 @@ def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool """ cdef: Py_ssize_t i, n = len(sequence) + int6432_t first_element if step == 0: return False + if n == 0: + return True - for i in range(n): - - if sequence[i] != sequence[0] + i * step: + first_element = sequence[0] + for i in range(1, n): + if sequence[i] != first_element + i * step: return False - return True