diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 933946b1ca1ac..8097118a79d20 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pandas.util.testing as tm +import warnings try: from pandas.api.types import union_categoricals @@ -122,11 +123,16 @@ def setup(self): ncats = 100 self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) - self.s_str_cat = self.s_str.astype("category") - self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered() + self.s_str_cat = pd.Series(self.s_str, dtype="category") + with warnings.catch_warnings(record=True): + str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True) + self.s_str_cat_ordered = self.s_str.astype(str_cat_type) + self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) - self.s_int_cat = self.s_int.astype("category") - self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered() + self.s_int_cat = pd.Series(self.s_int, dtype="category") + with warnings.catch_warnings(record=True): + int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True) + self.s_int_cat_ordered = self.s_int.astype(int_cat_type) def time_rank_string(self): self.s_str.rank() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 489e5c4cd63ea..eb730f91b10b3 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -15,6 +15,7 @@ concat, date_range, option_context, + period_range, ) @@ -93,22 +94,30 @@ def time_loc_slice(self, index, index_structure): class NonNumericSeriesIndexing: params = [ - ("string", "datetime"), - ("unique_monotonic_inc", "nonunique_monotonic_inc"), + ("string", "datetime", "period"), + ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"), ] param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): N = 10 ** 6 - indexes = { - "string": 
tm.makeStringIndex(N), - "datetime": date_range("1900", periods=N, freq="s"), - } - index = indexes[index] + if index == "string": + index = tm.makeStringIndex(N) + elif index == "datetime": + index = date_range("1900", periods=N, freq="s") + elif index == "period": + index = period_range("1900", periods=N, freq="s") + index = index.sort_values() + assert index.is_unique and index.is_monotonic_increasing if index_structure == "nonunique_monotonic_inc": index = index.insert(item=index[2], loc=2)[:-1] + elif index_structure == "non_monotonic": + index = index[::2].append(index[1::2]) + assert len(index) == N self.s = Series(np.random.rand(N), index=index) self.lbl = index[80000] + # warm up index mapping + self.s[self.lbl] def time_getitem_label_slice(self, index, index_structure): self.s[: self.lbl] diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 241e445bf6686..b0bbb72c9a428 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -938,6 +938,8 @@ Performance improvements - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`) - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`) - For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`) +- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`). + .. 
_whatsnew_0250.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d3837617d231a..96ce408a0ff8c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4792,7 +4792,6 @@ def get_indexer_non_unique(self, target): return pself.get_indexer_non_unique(ptarget) if self.is_all_dates: - self = Index(self.asi8) tgt_values = target.asi8 else: tgt_values = target._ndarray_values diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0013df44614e8..47cf0f26f9ca5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,6 +9,7 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( + ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_float, @@ -618,7 +619,7 @@ def get_value(self, series, key): elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal return com.maybe_box( - self, self._engine.get_value(s, key), series, key + self, self._int64index.get_value(s, key), series, key ) else: raise KeyError(key) @@ -627,7 +628,7 @@ def get_value(self, series, key): period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal - return com.maybe_box(self, self._engine.get_value(s, key), series, key) + return com.maybe_box(self, self._int64index.get_value(s, key), series, key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): @@ -648,6 +649,24 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): tolerance = self._convert_tolerance(tolerance, target) return Index.get_indexer(self._int64index, target, method, limit, tolerance) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + + if isinstance(target, PeriodIndex):
+ # check freq first: asi8 yields a plain ndarray that carries no freq + if target.freq != self.freq: + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=target.freqstr, + ) + raise IncompatibleFrequency(msg) + target = target.asi8 + + indexer, missing = self._int64index.get_indexer_non_unique(target) + return ensure_platform_int(indexer), missing + def _get_unique_index(self, dropna=False): """ wrap Index._get_unique_index to handle NaT @@ -954,6 +973,12 @@ def base(self): ) return np.asarray(self._data) + def memory_usage(self, deep=False): + result = super().memory_usage(deep=deep) + if hasattr(self, "_cache") and "_int64index" in self._cache: + result += self._int64index.memory_usage(deep=deep) + return result + PeriodIndex._add_comparison_ops() PeriodIndex._add_numeric_methods_disabled() diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 721ea2b6e6632..61a9909926efe 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -604,6 +604,30 @@ def test_indexing_over_size_cutoff(): _index._SIZE_CUTOFF = old_cutoff +def test_indexing_over_size_cutoff_period_index(): + # GH 27136 + + old_cutoff = _index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + n = 1100 + idx = pd.period_range("1/1/2000", freq="T", periods=n) + assert idx._engine.over_size_threshold + + s = pd.Series(np.random.randn(len(idx)), index=idx) + + pos = n - 1 + timestamp = idx[pos] + assert timestamp in s.index + + # it works! + s[timestamp] + assert len(s.loc[[timestamp]]) > 0 + finally: + _index._SIZE_CUTOFF = old_cutoff + + def test_indexing_unordered(): # GH 2437 rng = date_range(start="2011-01-01", end="2011-01-15")