From 5d4eb5e9f771062301ed6ad0ec34677c188374a9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Mar 2024 14:33:27 -0700 Subject: [PATCH 1/4] REF: Avoid new objects when reverse slicing when possible --- pandas/core/arrays/datetimelike.py | 9 ++--- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/range.py | 54 +++++++++++++++--------------- pandas/core/indexing.py | 2 +- pandas/core/internals/managers.py | 6 ++-- pandas/core/reshape/reshape.py | 5 +-- pandas/core/series.py | 4 +-- pandas/core/sorting.py | 11 +++--- 9 files changed, 50 insertions(+), 45 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ba2c936b75d9e..745774b34a3ad 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2371,11 +2371,12 @@ def factorize( ): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) - codes = np.arange(len(self), dtype=np.intp) - uniques = self.copy() # TODO: copy or view? if sort and self.freq.n < 0: - codes = codes[::-1] - uniques = uniques[::-1] + codes = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + uniques = self[::-1] + else: + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? return codes, uniques if sort: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c2df773326dc9..1357ba4af52c4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2116,7 +2116,7 @@ def droplevel(self, level: IndexLabel = 0): if not isinstance(level, (tuple, list)): level = [level] - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + levnums = sorted((self._get_level_number(lev) for lev in level), reverse=True) return self._drop_level_numbers(levnums) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2cb05dadd5981..2e554bc848ffe 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3589,7 +3589,7 @@ def _reorder_indexer( new_order = key_order_map[self.codes[i][indexer]] elif isinstance(k, slice) and k.step is not None and k.step < 0: # flip order for negative step - new_order = np.arange(n)[::-1][indexer] + new_order = np.arange(n - 1, -1, -1)[indexer] elif isinstance(k, slice) and k.start is None and k.stop is None: # slice(None) should not determine order GH#31330 new_order = np.ones((n,), dtype=np.intp)[indexer] diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 82bf8d7c70c7e..84c426b4cfa77 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -65,6 +65,12 @@ _dtype_int64 = np.dtype(np.int64) +def min_fitting_element(start: int, step: int, lower_limit: int) -> int: + """Returns the smallest element greater than or equal to the limit""" + no_steps = -(-(lower_limit - start) // abs(step)) + return start + abs(step) * no_steps + + class RangeIndex(Index): """ Immutable Index implementing a monotonic integer range. @@ -586,25 +592,30 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: kwargs.pop("kind", None) # e.g. "mergesort" is irrelevant nv.validate_argsort(args, kwargs) + start, stop, step = None, None, None if self._range.step > 0: - result = np.arange(len(self), dtype=np.intp) + if ascending: + start = len(self) + else: + start, stop, step = len(self) - 1, -1, -1 + elif ascending: + start, stop, step = len(self) - 1, -1, -1 else: - result = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + start = len(self) - if not ascending: - result = result[::-1] - return result + return np.arange(start, stop, step, dtype=np.intp) def factorize( self, sort: bool = False, use_na_sentinel: bool = True, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: - codes = np.arange(len(self), dtype=np.intp) - uniques = self if sort and self.step < 0: - codes = codes[::-1] - uniques = uniques[::-1] + codes = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + uniques = self[::-1] + else: + codes = np.arange(len(self), dtype=np.intp) + uniques = self return codes, uniques def equals(self, other: object) -> bool: @@ -715,26 +726,15 @@ def _intersection(self, other: Index, sort: bool = False): # intersection disregarding the lower bounds tmp_start = first.start + (second.start - first.start) * first.step // gcd * s new_step = first.step * second.step // gcd - new_range = range(tmp_start, int_high, new_step) - new_index = self._simple_new(new_range) # adjust index to limiting interval - new_start = new_index._min_fitting_element(int_low) - new_range = range(new_start, new_index.stop, new_index.step) - new_index = self._simple_new(new_range) + new_start = min_fitting_element(tmp_start, new_step, int_low) + new_range = range(new_start, int_high, new_step) - if (self.step < 0 and other.step < 0) is not (new_index.step < 0): - new_index = new_index[::-1] + if (self.step < 0 and other.step < 0) is not (new_range.step < 0): + new_range = new_range[::-1] - if sort is None: - new_index = new_index.sort_values() - - return new_index - - def _min_fitting_element(self, lower_limit: int) -> int: - """Returns the smallest element greater than or equal to the limit""" - no_steps = -(-(lower_limit - self.start) // abs(self.step)) - return self.start + abs(self.step) * no_steps + return self._simple_new(new_range) def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]: """ @@ -920,9 +920,9 @@ def _difference(self, other, sort=None): # e.g. range(10) and range(0, 10, 3) return super()._difference(other, sort=sort) - new_index = type(self)._simple_new(new_rng, name=res_name) if first is not self._range: - new_index = new_index[::-1] + new_rng = new_rng[::-1] + new_index = type(self)._simple_new(new_rng, name=res_name) return new_index diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c7a938dbc4449..c8a2e11dce3d7 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1145,7 +1145,7 @@ def _contains_slice(x: object) -> bool: # GH#41369 Loop in reverse order ensures indexing along columns before rows # which selects only necessary blocks which avoids dtype conversion if possible axis = len(tup) - 1 - for key in tup[::-1]: + for key in reversed(tup): if com.is_null_slice(key): axis -= 1 continue diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d920ebc60de8c..94548c3128afc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1549,9 +1549,9 @@ def _insert_update_blklocs_and_blknos(self, loc) -> None: self._blklocs = np.append(self._blklocs, 0) self._blknos = np.append(self._blknos, len(self.blocks)) elif loc == 0: - # np.append is a lot faster, let's use it if we can. - self._blklocs = np.append(self._blklocs[::-1], 0)[::-1] - self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1] + # As of numpy 1.26.4, np.concatenate faster than np.append + self._blklocs = np.concatenate(([0], self._blklocs)) + self._blknos = np.concatenate(([len(self.blocks)], self._blknos)) else: new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( self.blklocs, self.blknos, loc, len(self.blocks) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b28010c13d6dd..ff358e8ba346c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -910,9 +910,10 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: raise ValueError("Columns with duplicate values are not supported in stack") # If we need to drop `level` from columns, it needs to be in descending order + set_levels = set(level) drop_levnums = sorted(level, reverse=True) stack_cols = frame.columns._drop_level_numbers( - [k for k in range(frame.columns.nlevels) if k not in level][::-1] + [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) if len(level) > 1: # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] @@ -936,7 +937,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: idx = (idx,) gen = iter(idx) column_indexer = tuple( - next(gen) if k in level else slice(None) + next(gen) if k in set_levels else slice(None) for k in range(frame.columns.nlevels) ) data = frame.loc[:, column_indexer] diff --git a/pandas/core/series.py b/pandas/core/series.py index 699ff413efb91..18f8ba2f2f85b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5510,9 +5510,9 @@ def case_when( replacements = updated_replacements default = default.astype(common_dtype) - counter = reversed(range(len(conditions))) + counter = range(len(conditions) - 1, -1, -1) for position, condition, replacement in zip( - counter, conditions[::-1], replacements[::-1] + counter, reversed(conditions), reversed(replacements) ): try: default = default.mask( diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 4774b013fc428..493e856c6dcc6 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,6 +2,7 @@ from __future__ import annotations +import itertools from typing import ( TYPE_CHECKING, Callable, @@ -334,13 +335,15 @@ def lexsort_indexer( raise ValueError(f"invalid na_position: {na_position}") if isinstance(orders, bool): - orders = [orders] * len(keys) + orders = itertools.repeat(orders, len(keys)) elif orders is None: - orders = [True] * len(keys) + orders = itertools.repeat(True, len(keys)) + else: + orders = reversed(orders) labels = [] - for k, order in zip(keys, orders): + for k, order in zip(reversed(keys), orders): k = ensure_key_mapped(k, key) if codes_given: codes = cast(np.ndarray, k) @@ -361,7 +364,7 @@ def lexsort_indexer( labels.append(codes) - return np.lexsort(labels[::-1]) + return np.lexsort(labels) def nargsort( From b1d582eacb0717885b050664705907e4bb229c28 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:18:14 -0700 Subject: [PATCH 2/4] Adjust test --- pandas/tests/indexes/ranges/test_range.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 72762db21b0c5..c9ddbf4464b29 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -9,6 +9,7 @@ RangeIndex, ) import pandas._testing as tm +from pandas.core.indexes.range import min_fitting_element class TestRangeIndex: @@ -419,21 +420,21 @@ def test_extended_gcd(self, simple_index): assert 2 == result[0] def test_min_fitting_element(self): - result = RangeIndex(0, 20, 2)._min_fitting_element(1) + result = min_fitting_element(0, 2, 1) assert 2 == result - result = RangeIndex(1, 6)._min_fitting_element(1) + result = min_fitting_element(1, 1, 1) assert 1 == result - result = RangeIndex(18, -2, -2)._min_fitting_element(1) + result = min_fitting_element(18, -2, 1) assert 2 == result - result = RangeIndex(5, 0, -1)._min_fitting_element(1) + result = min_fitting_element(5, -1, 1) assert 1 == result big_num = 500000000000000000000000 - result = RangeIndex(5, big_num * 2, 1)._min_fitting_element(big_num) + result = min_fitting_element(5, 1, big_num) assert big_num == result def test_slice_specialised(self, simple_index): From af63cd6104d13837b6f18fb88b3e84d41a607011 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:12:18 -0700 Subject: [PATCH 3/4] Remove astypes --- pandas/tests/indexes/ranges/test_setops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index d417b8b743dc5..ac24ff828cb8f 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -93,12 +93,12 @@ def test_intersection(self, sort): # GH 17296: intersect two decreasing RangeIndexes first = RangeIndex(10, -2, -2) other = RangeIndex(5, -4, -1) - expected = first.astype(int).intersection(other.astype(int), sort=sort) - result = first.intersection(other, sort=sort).astype(int) + expected = RangeIndex(start=4, stop=-2, step=-2) + result = first.intersection(other, sort=sort) tm.assert_index_equal(result, expected) # reversed - result = other.intersection(first, sort=sort).astype(int) + result = other.intersection(first, sort=sort) tm.assert_index_equal(result, expected) index = RangeIndex(5, name="foo") From 01a9cd80b229e9348db2c80c4b221d6681d38ead Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Mar 2024 18:15:52 -0700 Subject: [PATCH 4/4] Fix typing --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 94548c3128afc..af851e1fc8224 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1550,8 +1550,8 @@ def _insert_update_blklocs_and_blknos(self, loc) -> None: self._blknos = np.append(self._blknos, len(self.blocks)) elif loc == 0: # As of numpy 1.26.4, np.concatenate faster than np.append - self._blklocs = np.concatenate(([0], self._blklocs)) - self._blknos = np.concatenate(([len(self.blocks)], self._blknos)) + self._blklocs = np.concatenate([[0], self._blklocs]) + self._blknos = np.concatenate([[len(self.blocks)], self._blknos]) else: new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( self.blklocs, self.blknos, loc, len(self.blocks)