Skip to content

Commit af5b2a2

Browse files
qwhelan authored and jreback committed
PERF, BENCH: Fix performance issue when indexing into non-unique DatetimeIndex/PeriodIndex. (#27136)
1 parent 20a85c3 commit af5b2a2

File tree

6 files changed

+78
-14
lines changed

6 files changed

+78
-14
lines changed

asv_bench/benchmarks/categoricals.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
import pandas as pd
33
import pandas.util.testing as tm
4+
import warnings
45

56
try:
67
from pandas.api.types import union_categoricals
@@ -122,11 +123,16 @@ def setup(self):
122123
ncats = 100
123124

124125
self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
125-
self.s_str_cat = self.s_str.astype("category")
126-
self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered()
126+
self.s_str_cat = pd.Series(self.s_str, dtype="category")
127+
with warnings.catch_warnings(record=True):
128+
str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
129+
self.s_str_cat_ordered = self.s_str.astype(str_cat_type)
130+
127131
self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
128-
self.s_int_cat = self.s_int.astype("category")
129-
self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered()
132+
self.s_int_cat = pd.Series(self.s_int, dtype="category")
133+
with warnings.catch_warnings(record=True):
134+
int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True)
135+
self.s_int_cat_ordered = self.s_int.astype(int_cat_type)
130136

131137
def time_rank_string(self):
132138
self.s_str.rank()

asv_bench/benchmarks/indexing.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
concat,
1616
date_range,
1717
option_context,
18+
period_range,
1819
)
1920

2021

@@ -93,22 +94,30 @@ def time_loc_slice(self, index, index_structure):
9394
class NonNumericSeriesIndexing:
9495

9596
params = [
96-
("string", "datetime"),
97-
("unique_monotonic_inc", "nonunique_monotonic_inc"),
97+
("string", "datetime", "period"),
98+
("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"),
9899
]
99100
param_names = ["index_dtype", "index_structure"]
100101

101102
def setup(self, index, index_structure):
102103
N = 10 ** 6
103-
indexes = {
104-
"string": tm.makeStringIndex(N),
105-
"datetime": date_range("1900", periods=N, freq="s"),
106-
}
107-
index = indexes[index]
104+
if index == "string":
105+
index = tm.makeStringIndex(N)
106+
elif index == "datetime":
107+
index = date_range("1900", periods=N, freq="s")
108+
elif index == "period":
109+
index = period_range("1900", periods=N, freq="s")
110+
index = index.sort_values()
111+
assert index.is_unique and index.is_monotonic_increasing
108112
if index_structure == "nonunique_monotonic_inc":
109113
index = index.insert(item=index[2], loc=2)[:-1]
114+
elif index_structure == "non_monotonic":
115+
index = index[::2].append(index[1::2])
116+
assert len(index) == N
110117
self.s = Series(np.random.rand(N), index=index)
111118
self.lbl = index[80000]
119+
# warm up index mapping
120+
self.s[self.lbl]
112121

113122
def time_getitem_label_slice(self, index, index_structure):
114123
self.s[: self.lbl]

doc/source/whatsnew/v0.25.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,8 @@ Performance improvements
938938
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
939939
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
940940
- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)
941+
- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`).
942+
941943
942944
.. _whatsnew_0250.bug_fixes:
943945

pandas/core/indexes/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4792,7 +4792,6 @@ def get_indexer_non_unique(self, target):
47924792
return pself.get_indexer_non_unique(ptarget)
47934793

47944794
if self.is_all_dates:
4795-
self = Index(self.asi8)
47964795
tgt_values = target.asi8
47974796
else:
47984797
tgt_values = target._ndarray_values

pandas/core/indexes/period.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pandas.util._decorators import Appender, Substitution, cache_readonly
1010

1111
from pandas.core.dtypes.common import (
12+
ensure_platform_int,
1213
is_bool_dtype,
1314
is_datetime64_any_dtype,
1415
is_float,
@@ -618,7 +619,7 @@ def get_value(self, series, key):
618619
elif grp == freqn:
619620
key = Period(asdt, freq=self.freq).ordinal
620621
return com.maybe_box(
621-
self, self._engine.get_value(s, key), series, key
622+
self, self._int64index.get_value(s, key), series, key
622623
)
623624
else:
624625
raise KeyError(key)
@@ -627,7 +628,7 @@ def get_value(self, series, key):
627628

628629
period = Period(key, self.freq)
629630
key = period.value if isna(period) else period.ordinal
630-
return com.maybe_box(self, self._engine.get_value(s, key), series, key)
631+
return com.maybe_box(self, self._int64index.get_value(s, key), series, key)
631632

632633
@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
633634
def get_indexer(self, target, method=None, limit=None, tolerance=None):
@@ -648,6 +649,23 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
648649
tolerance = self._convert_tolerance(tolerance, target)
649650
return Index.get_indexer(self._int64index, target, method, limit, tolerance)
650651

652+
@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
    # NOTE: no literal docstring here on purpose -- ``Appender`` supplies the
    # shared ``get_indexer_non_unique`` documentation for this method.
    target = ensure_index(target)

    # Validate the frequency *before* unwrapping the target to its integer
    # ordinals.  Once ``target`` has been replaced by ``target.asi8`` it no
    # longer carries a ``freq`` attribute, so checking afterwards would never
    # fire for a PeriodIndex and ordinals of a different frequency would be
    # matched silently against our own.
    if hasattr(target, "freq") and target.freq != self.freq:
        msg = DIFFERENT_FREQ.format(
            cls=type(self).__name__,
            own_freq=self.freqstr,
            other_freq=target.freqstr,
        )
        raise IncompatibleFrequency(msg)

    # Frequencies agree: compare on the raw ordinals so the lookup can be
    # delegated to the integer index.
    if isinstance(target, PeriodIndex):
        target = target.asi8

    indexer, missing = self._int64index.get_indexer_non_unique(target)
    # Only the indexer is normalised to the platform integer type; ``missing``
    # is passed through exactly as the integer index returned it.
    return ensure_platform_int(indexer), missing
668+
651669
def _get_unique_index(self, dropna=False):
652670
"""
653671
wrap Index._get_unique_index to handle NaT
@@ -954,6 +972,12 @@ def base(self):
954972
)
955973
return np.asarray(self._data)
956974

975+
def memory_usage(self, deep=False):
    """
    Return the memory usage of this index, including the cached
    ``_int64index`` when one is present in ``self._cache``.

    Parameters
    ----------
    deep : bool, default False
        Forwarded unchanged to the parent implementation and to the
        cached ``_int64index``.

    Returns
    -------
    The parent's reported usage, plus the usage of ``_int64index`` if
    that entry exists in ``self._cache``.
    """
    total = super().memory_usage(deep=deep)
    # Nothing extra to account for unless the integer view has already been
    # materialised in the cache; looking it up here must not create it.
    if not hasattr(self, "_cache") or "_int64index" not in self._cache:
        return total
    return total + self._int64index.memory_usage(deep=deep)
980+
957981

958982
PeriodIndex._add_comparison_ops()
959983
PeriodIndex._add_numeric_methods_disabled()

pandas/tests/series/indexing/test_datetime.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,30 @@ def test_indexing_over_size_cutoff():
604604
_index._SIZE_CUTOFF = old_cutoff
605605

606606

607+
def test_indexing_over_size_cutoff_period_index():
    # GH 27136
    # Label-based lookups on a PeriodIndex must keep working once the index
    # is longer than the engine's size cutoff.

    saved_cutoff = _index._SIZE_CUTOFF
    try:
        # Lower the cutoff so a small index already counts as "over size".
        _index._SIZE_CUTOFF = 1000

        n = 1100
        idx = pd.period_range("1/1/2000", freq="T", periods=n)
        assert idx._engine.over_size_threshold

        s = pd.Series(np.random.randn(n), index=idx)

        # Probe with the last label of the index.
        last_period = idx[n - 1]
        assert last_period in s.index

        # it works! (scalar lookup must not raise)
        s[last_period]
        assert len(s.loc[[last_period]]) > 0
    finally:
        # Always restore the module-level cutoff for the other tests.
        _index._SIZE_CUTOFF = saved_cutoff
629+
630+
607631
def test_indexing_unordered():
608632
# GH 2437
609633
rng = date_range(start="2011-01-01", end="2011-01-15")

0 commit comments

Comments
 (0)