Skip to content

Commit 5916788

Browse files
committed
PERF, BENCH: Fix performance issue when indexing into non-unique DatetimeIndex/PeriodIndex.
Additionally, fix asv benchmark bugs that hid the issue and correct PeriodIndex.memory_usage().
1 parent b870dee commit 5916788

File tree

5 files changed: +68 -11 lines changed

5 files changed: +68 -11 lines changed

asv_bench/benchmarks/indexing.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,8 @@
55
from pandas import (Series, DataFrame, MultiIndex,
66
Int64Index, UInt64Index, Float64Index,
77
IntervalIndex, CategoricalIndex,
8-
IndexSlice, concat, date_range, option_context)
8+
IndexSlice, concat, date_range, option_context,
9+
period_range)
910

1011

1112
class NumericSeriesIndexing:
@@ -82,20 +83,30 @@ def time_loc_slice(self, index, index_structure):
8283
class NonNumericSeriesIndexing:
8384

8485
params = [
85-
('string', 'datetime'),
86-
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
86+
('string', 'datetime', 'period'),
87+
('unique_monotonic_inc', 'nonunique_monotonic_inc', 'non_monotonic'),
8788
]
8889
param_names = ['index_dtype', 'index_structure']
8990

9091
def setup(self, index, index_structure):
9192
N = 10**6
92-
indexes = {'string': tm.makeStringIndex(N),
93-
'datetime': date_range('1900', periods=N, freq='s')}
94-
index = indexes[index]
93+
if index == 'string':
94+
index = tm.makeStringIndex(N)
95+
elif index == 'datetime':
96+
index = date_range('1900', periods=N, freq='s')
97+
elif index == 'period':
98+
index = period_range('1900', periods=N, freq='s')
99+
index = index.sort_values()
100+
assert index.is_unique and index.is_monotonic_increasing
95101
if index_structure == 'nonunique_monotonic_inc':
96102
index = index.insert(item=index[2], loc=2)[:-1]
103+
elif index_structure == 'non_monotonic':
104+
index = index[::2].append(index[1::2])
105+
assert len(index) == N
97106
self.s = Series(np.random.rand(N), index=index)
98107
self.lbl = index[80000]
108+
# warm up index mapping
109+
self.s[self.lbl]
99110

100111
def time_getitem_label_slice(self, index, index_structure):
101112
self.s[:self.lbl]

doc/source/whatsnew/v0.25.0.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -666,6 +666,7 @@ Performance improvements
666666
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
667667
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
668668
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
669+
- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`).
669670

670671
.. _whatsnew_0250.bug_fixes:
671672

pandas/core/indexes/base.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4563,7 +4563,6 @@ def get_indexer_non_unique(self, target):
45634563
return pself.get_indexer_non_unique(ptarget)
45644564

45654565
if self.is_all_dates:
4566-
self = Index(self.asi8)
45674566
tgt_values = target.asi8
45684567
else:
45694568
tgt_values = target._ndarray_values

pandas/core/indexes/period.py

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,8 @@
1111
from pandas.util._decorators import Appender, Substitution, cache_readonly
1212

1313
from pandas.core.dtypes.common import (
14-
is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype,
15-
is_integer, is_integer_dtype, pandas_dtype)
14+
ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_float,
15+
is_float_dtype, is_integer, is_integer_dtype, pandas_dtype)
1616

1717
from pandas.core import common as com
1818
from pandas.core.accessor import delegate_names
@@ -600,7 +600,8 @@ def get_value(self, series, key):
600600
return series[key]
601601
elif grp == freqn:
602602
key = Period(asdt, freq=self.freq).ordinal
603-
return com.maybe_box(self, self._engine.get_value(s, key),
603+
return com.maybe_box(self,
604+
self._int64index.get_value(s, key),
604605
series, key)
605606
else:
606607
raise KeyError(key)
@@ -609,7 +610,7 @@ def get_value(self, series, key):
609610

610611
period = Period(key, self.freq)
611612
key = period.value if isna(period) else period.ordinal
612-
return com.maybe_box(self, self._engine.get_value(s, key),
613+
return com.maybe_box(self, self._int64index.get_value(s, key),
613614
series, key)
614615

615616
@Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
@@ -630,6 +631,21 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
630631
return Index.get_indexer(self._int64index, target, method,
631632
limit, tolerance)
632633

634+
@Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
635+
def get_indexer_non_unique(self, target):
636+
target = ensure_index(target)
637+
638+
if isinstance(target, PeriodIndex):
639+
target = target.asi8
640+
if hasattr(target, 'freq') and target.freq != self.freq:
641+
msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
642+
own_freq=self.freqstr,
643+
other_freq=target.freqstr)
644+
raise IncompatibleFrequency(msg)
645+
646+
indexer, missing = self._int64index.get_indexer_non_unique(target)
647+
return ensure_platform_int(indexer), missing
648+
633649
def _get_unique_index(self, dropna=False):
634650
"""
635651
wrap Index._get_unique_index to handle NaT
@@ -906,6 +922,12 @@ def base(self):
906922
FutureWarning, stacklevel=2)
907923
return np.asarray(self._data)
908924

925+
def memory_usage(self, deep=False):
926+
result = super().memory_usage(deep=deep)
927+
if hasattr(self, '_cache') and '_int64index' in self._cache:
928+
result += self._int64index.memory_usage(deep=deep)
929+
return result
930+
909931

910932
PeriodIndex._add_comparison_ops()
911933
PeriodIndex._add_numeric_methods_disabled()

pandas/tests/series/indexing/test_datetime.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -576,6 +576,30 @@ def test_indexing_over_size_cutoff():
576576
_index._SIZE_CUTOFF = old_cutoff
577577

578578

579+
def test_indexing_over_size_cutoff_period_index():
580+
581+
old_cutoff = _index._SIZE_CUTOFF
582+
try:
583+
_index._SIZE_CUTOFF = 1000
584+
585+
n = 1100
586+
idx = pd.period_range('1/1/2000', freq='T', periods=n)
587+
assert idx._engine.over_size_threshold
588+
589+
s = pd.Series(np.random.randn(len(idx)),
590+
index=idx)
591+
592+
pos = n - 1
593+
timestamp = idx[pos]
594+
assert timestamp in s.index
595+
596+
# it works!
597+
s[timestamp]
598+
assert len(s.loc[[timestamp]]) > 0
599+
finally:
600+
_index._SIZE_CUTOFF = old_cutoff
601+
602+
579603
def test_indexing_unordered():
580604
# GH 2437
581605
rng = date_range(start='2011-01-01', end='2011-01-15')

0 commit comments

Comments (0)