diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 896a20bae2069..78fe2ae966896 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -94,6 +94,12 @@ def time_min(self): def time_min_trivial(self): self.idx_inc.min() + def time_get_loc_inc(self): + self.idx_inc.get_loc(900000) + + def time_get_loc_dec(self): + self.idx_dec.get_loc(100000) + class IndexAppend: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a62cac7a94bbd..1bd5d1ea9d922 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -393,6 +393,7 @@ Performance Improvements - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) +- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) - Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ea14a4c789cd3..9401de3346ccd 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -22,6 +22,8 @@ from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index +from pandas.io.formats.printing import pprint_thing + class RangeIndex(Int64Index): """ @@ -64,6 +66,8 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = 
libindex.Int64Engine + # check whether self._data has been called + _cached_data = None # type: np.ndarray # -------------------------------------------------------------------- # Constructors @@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None, for k, v in kwargs.items(): setattr(result, k, v) + result._range = range(result._start, result._stop, result._step) + result._reset_identity() return result @@ -180,9 +186,19 @@ def _constructor(self): """ return the class to use for construction """ return Int64Index - @cache_readonly + @property def _data(self): - return np.arange(self._start, self._stop, self._step, dtype=np.int64) + """ + An int array that for performance reasons is created only when needed. + + The constructed array is saved in ``_cached_data``. This allows us to + check if the array has been created without accessing ``_data`` and + triggering the construction. + """ + if self._cached_data is None: + self._cached_data = np.arange(self._start, self._stop, self._step, + dtype=np.int64) + return self._cached_data @cache_readonly def _int64index(self): @@ -215,6 +231,9 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_with_header(self, header, na_rep='NaN', **kwargs): + return header + list(map(pprint_thing, self._range)) + # -------------------------------------------------------------------- @property def start(self): @@ -296,6 +315,15 @@ def is_monotonic_decreasing(self): def has_duplicates(self): return False + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if is_integer(key) and method is None and tolerance is None: + try: + return self._range.index(key) + except ValueError: + raise KeyError(key) + return super().get_loc(key, method=method, tolerance=tolerance) + def tolist(self): return list(range(self._start, self._stop, self._step)) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py 
index b2c330015081c..477a4e527f278 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -241,6 +241,42 @@ def test_view(self): def test_dtype(self): assert self.index.dtype == np.int64 + def test_cached_data(self): + # GH 26565 + # Calling RangeIndex._data caches an int64 array of the same length at + # self._cached_data. This tests whether _cached_data has been set. + idx = RangeIndex(0, 100, 10) + + assert idx._cached_data is None + + repr(idx) + assert idx._cached_data is None + + str(idx) + assert idx._cached_data is None + + idx.get_loc(20) + assert idx._cached_data is None + + df = pd.DataFrame({'a': range(10)}, index=idx) + + df.loc[50] + assert idx._cached_data is None + + with pytest.raises(KeyError): + df.loc[51] + assert idx._cached_data is None + + df.loc[10:50] + assert idx._cached_data is None + + df.iloc[5:10] + assert idx._cached_data is None + + # actually calling idx._data + assert isinstance(idx._data, np.ndarray) + assert isinstance(idx._cached_data, np.ndarray) + def test_is_monotonic(self): assert self.index.is_monotonic is True assert self.index.is_monotonic_increasing is True