From 416bdb9ce1b263c657b164f27f56f662099e7e30 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 10:40:02 -0700 Subject: [PATCH 1/7] Add RangeIndex.value_counts,searchsorted,to_numpy --- pandas/core/base.py | 2 -- pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 3 +- pandas/core/indexes/range.py | 63 ++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 95b203590b393..8e68cb2087fcb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -528,7 +528,6 @@ def array(self) -> ExtensionArray: """ raise AbstractMethodError(self) - @final def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -880,7 +879,6 @@ def _map_values(self, mapper, na_action=None): return algorithms.map_array(arr, mapper, na_action=na_action) - @final def value_counts( self, normalize: bool = False, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0185ca8241617..a623ad1bfbe45 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4012,7 +4012,6 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: return series._values[index] series = self._get_item(col) - engine = self.index._engine if not isinstance(self.index, MultiIndex): # CategoricalIndex: Trying to use the engine fastpath may give incorrect @@ -4023,6 +4022,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: # For MultiIndex going through engine effectively restricts us to # same-length tuples; see test_get_set_value_no_partial_indexing + engine = self.index._engine loc = engine.get_loc(index) return series._values[loc] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 69f916bb3f769..9036123f3cb67 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -832,7 +832,8 @@ def _reset_identity(self) -> None: @final def _cleanup(self) -> None: - 
self._engine.clear_mapping() + if "_engine" in self._cache: + self._engine.clear_mapping() @cache_readonly def _engine( diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0ba3c22093c69..d36e504870548 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -57,9 +57,13 @@ Dtype, JoinHow, NaPosition, + NumpySorter, Self, npt, ) + + from pandas import Series + _empty_range = range(0) _dtype_int64 = np.dtype(np.int64) @@ -1359,3 +1363,62 @@ def take( # type: ignore[override] taken += self.start return self._shallow_copy(taken, name=self.name) + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + **kwargs, + ) -> np.ndarray: + return np.arange( + start=self.start, stop=self.stop, step=self.step, dtype=dtype or self.dtype + ) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series: + from pandas import Series + + if bins is not None: + return super().value_counts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + data = np.ones(len(self), dtype=np.int64) + if normalize: + data /= len(self) + return Series(data, index=self.copy()) + + def searchsorted( + self, + value, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if ( + side not in {"left", "right"} + or sorter is not None + or not is_float(value) + or not is_integer(value) + ): + return super().searchsorted(value=value, side=side, sorter=sorter) + if flip := (self.step < 0): + rng = self._range[::-1] + shift = int(side == "right") + else: + rng = self._range + shift = int(side == "left") + + offset = (value - rng.start - shift) // rng.step + 1 + if flip: + offset = len(self) - offset + return np.intp(max(min(len(self), offset), 0)) From 6a54314da750d839abe2209d3fc9487eb663bfed Mon Sep 
17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 11:10:49 -0700 Subject: [PATCH 2/7] Undo engine stuff --- pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c7bda7275d469..567fcb1ef7c05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4012,6 +4012,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: return series._values[index] series = self._get_item(col) + engine = self.index._engine if not isinstance(self.index, MultiIndex): # CategoricalIndex: Trying to use the engine fastpath may give incorrect @@ -4022,7 +4023,6 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: # For MultiIndex going through engine effectively restricts us to # same-length tuples; see test_get_set_value_no_partial_indexing - engine = self.index._engine loc = engine.get_loc(index) return series._values[loc] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d2129c54fabc4..63facb61ed498 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -832,8 +832,7 @@ def _reset_identity(self) -> None: @final def _cleanup(self) -> None: - if "_engine" in self._cache: - self._engine.clear_mapping() + self._engine.clear_mapping() @cache_readonly def _engine( From 6a656d7303fc408735266f29a4bd99beb128c7ac Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:18:44 -0700 Subject: [PATCH 3/7] Finish searchsorted, add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 3 ++ pandas/core/indexes/range.py | 41 +++++++++++------- pandas/tests/indexes/ranges/test_range.py | 52 +++++++++++++++++++++++ 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c817e09b3b360..d11d4470e149b 100644 --- 
a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -332,6 +332,9 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) +- Performance improvement in :meth:`RangeIndex.to_numpy` (:issue:`?`) +- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`?`) +- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`) - Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57651`, :issue:`57752`) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d36e504870548..9bc66a13efd11 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1393,10 +1393,11 @@ def value_counts( bins=bins, dropna=dropna, ) + name = "proportion" if normalize else "count" data = np.ones(len(self), dtype=np.int64) if normalize: - data /= len(self) - return Series(data, index=self.copy()) + data = data / len(self) + return Series(data, index=self.copy(), name=name) def searchsorted( self, @@ -1404,21 +1405,31 @@ def searchsorted( side: Literal["left", "right"] = "left", sorter: NumpySorter | None = None, ) -> npt.NDArray[np.intp] | np.intp: - if ( - side not in {"left", "right"} - or sorter is not None - or not is_float(value) - or not is_integer(value) - ): + if side not in {"left", "right"} or sorter is not None: return super().searchsorted(value=value, side=side, sorter=sorter) + + was_scalar = False + if is_scalar(value): + was_scalar = True + array_value = np.array([value]) + else: + array_value = np.asarray(value) + if array_value.dtype.kind not in "iu": + return super().searchsorted(value=value, side=side, sorter=sorter) + if flip := (self.step < 0): rng = self._range[::-1] - shift = int(side == "right") + start = rng.start + step = rng.step + shift = side == "right" else: - rng = self._range - shift = int(side == "left") - - offset = (value - rng.start - shift) // rng.step + 1 + start = self.start + step = self.step + shift = side == "left" + result = (array_value - start - int(shift)) // step + 1 if flip: - offset = len(self) - offset - return np.intp(max(min(len(self), offset), 0)) + result = len(self) - result + result = np.maximum(np.minimum(result, len(self)), 0) + if was_scalar: + return np.intp(result.item()) + return result.astype(np.intp, copy=False) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 727edb7ae30ad..a0e3a46424c06 100644 
--- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -874,3 +874,55 @@ def test_getitem_integers_return_index(): result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]] expected = Index([0, 2, 8], dtype="int64", name="foo") tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("copy", [True, False]) +@pytest.mark.parametrize( + "rng", + [ + range(3), + range(0), + range(0, 3, 2), + range(3, -3, -2), + ], +) +def test_to_numpy(copy, rng, any_int_numpy_dtype): + ri = RangeIndex(rng) + result = ri.to_numpy(dtype=any_int_numpy_dtype, copy=copy) + expected = np.arange( + start=rng.start, stop=rng.stop, step=rng.step, dtype=any_int_numpy_dtype + ) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "rng", + [ + range(3), + range(0), + range(0, 3, 2), + range(3, -3, -2), + ], +) +def test_value_counts(sort, dropna, ascending, normalize, rng): + ri = RangeIndex(rng, name="A") + result = ri.value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = Index(list(rng), name="A").value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + tm.assert_series_equal(result, expected, check_index_type=False) + + +@pytest.mark.parametrize("side", ["left", "right"]) +@pytest.mark.parametrize("value", [0, -5, 5, -3, np.array([-5, -3, 0, 5])]) +def test_searchsorted(side, value): + ri = RangeIndex(-3, 3, 2) + result = ri.searchsorted(value=value, side=side) + expected = Index(list(ri)).searchsorted(value=value, side=side) + if isinstance(value, int): + assert result == expected + else: + tm.assert_numpy_array_equal(result, expected) From 466360361ff767e821a9a2826075fbd843468d7a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:13:47 -0700 Subject: [PATCH 4/7] Remove old to_numpy implementation --- 
doc/source/whatsnew/v3.0.0.rst | 1 - pandas/core/base.py | 2 +- pandas/core/indexes/range.py | 11 ----------- pandas/tests/indexes/ranges/test_range.py | 19 ------------------- 4 files changed, 1 insertion(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d11d4470e149b..56321b9a41b6c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -332,7 +332,6 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) -- Performance improvement in :meth:`RangeIndex.to_numpy` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 817e1676c13ec..bda3058ca663c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -658,7 +658,7 @@ def to_numpy( ) values = self._values - if fillna: + if fillna and self.hasnans: if not can_hold_element(values, na_value): # if we can't hold the na_value asarray either makes a copy or we # error before modifying values. 
The asarray later on thus won't make diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9bc66a13efd11..b88e32c197b74 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1364,17 +1364,6 @@ def take( # type: ignore[override] return self._shallow_copy(taken, name=self.name) - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value: object = lib.no_default, - **kwargs, - ) -> np.ndarray: - return np.arange( - start=self.start, stop=self.stop, step=self.step, dtype=dtype or self.dtype - ) - def value_counts( self, normalize: bool = False, diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index a0e3a46424c06..1f9df30d60c11 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -876,25 +876,6 @@ def test_getitem_integers_return_index(): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize("copy", [True, False]) -@pytest.mark.parametrize( - "rng", - [ - range(3), - range(0), - range(0, 3, 2), - range(3, -3, -2), - ], -) -def test_to_numpy(copy, rng, any_int_numpy_dtype): - ri = RangeIndex(rng) - result = ri.to_numpy(dtype=any_int_numpy_dtype, copy=copy) - expected = np.arange( - start=rng.start, stop=rng.stop, step=rng.step, dtype=any_int_numpy_dtype - ) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "rng", From ced1861a689b16b81a8baddb11af5c2cc474e6f0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:15:42 -0700 Subject: [PATCH 5/7] Add whatsnew for to_numpy --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 56321b9a41b6c..4ab223a59d811 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst 
@@ -332,6 +332,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) +- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`) From 350a7bab2e07caa2b3081a26c79a71425cb69ffb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:24:17 -0700 Subject: [PATCH 6/7] add whatsnew number --- doc/source/whatsnew/v3.0.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4ab223a59d811..84c30e5e9d0b0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -332,11 +332,11 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) -- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`?`) -- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`?`) -- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`?`) - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`) - Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`) +- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`) +- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`) +- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57445`, :issue:`57752`) From 6005d9cad3932e29bc8433637213d06bf28aa64b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:58:10 -0700 Subject: [PATCH 7/7] Fix typing --- pandas/core/indexes/range.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b88e32c197b74..bd9e8b84fd82a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1383,12 +1383,14 @@ def value_counts( dropna=dropna, ) name = "proportion" if normalize else "count" - data = np.ones(len(self), dtype=np.int64) + data: npt.NDArray[np.floating] | npt.NDArray[np.signedinteger] = np.ones( + len(self), dtype=np.int64 + ) if normalize: data = data / len(self) return Series(data, index=self.copy(), name=name) - def searchsorted( + def searchsorted( # type: ignore[override] self, value, side: Literal["left", "right"] = "left",