Skip to content

PERF, BENCH: Fix performance issue when indexing into non-unique DatetimeIndex/PeriodIndex. #27136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Jul 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import warnings

try:
from pandas.api.types import union_categoricals
Expand Down Expand Up @@ -122,11 +123,16 @@ def setup(self):
ncats = 100

self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
self.s_str_cat = self.s_str.astype("category")
self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered()
self.s_str_cat = pd.Series(self.s_str, dtype="category")
with warnings.catch_warnings(record=True):
str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
self.s_str_cat_ordered = self.s_str.astype(str_cat_type)

self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
self.s_int_cat = self.s_int.astype("category")
self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered()
self.s_int_cat = pd.Series(self.s_int, dtype="category")
with warnings.catch_warnings(record=True):
int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True)
self.s_int_cat_ordered = self.s_int.astype(int_cat_type)

def time_rank_string(self):
self.s_str.rank()
Expand Down
23 changes: 16 additions & 7 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
concat,
date_range,
option_context,
period_range,
)


Expand Down Expand Up @@ -93,22 +94,30 @@ def time_loc_slice(self, index, index_structure):
class NonNumericSeriesIndexing:

params = [
("string", "datetime"),
("unique_monotonic_inc", "nonunique_monotonic_inc"),
("string", "datetime", "period"),
("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"),
]
param_names = ["index_dtype", "index_structure"]

def setup(self, index, index_structure):
N = 10 ** 6
indexes = {
"string": tm.makeStringIndex(N),
"datetime": date_range("1900", periods=N, freq="s"),
}
index = indexes[index]
if index == "string":
index = tm.makeStringIndex(N)
elif index == "datetime":
index = date_range("1900", periods=N, freq="s")
elif index == "period":
index = period_range("1900", periods=N, freq="s")
index = index.sort_values()
assert index.is_unique and index.is_monotonic_increasing
if index_structure == "nonunique_monotonic_inc":
index = index.insert(item=index[2], loc=2)[:-1]
elif index_structure == "non_monotonic":
index = index[::2].append(index[1::2])
assert len(index) == N
self.s = Series(np.random.rand(N), index=index)
self.lbl = index[80000]
# warm up index mapping
self.s[self.lbl]

def time_getitem_label_slice(self, index, index_structure):
self.s[: self.lbl]
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,8 @@ Performance improvements
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)
- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`)


.. _whatsnew_0250.bug_fixes:

Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4792,7 +4792,6 @@ def get_indexer_non_unique(self, target):
return pself.get_indexer_non_unique(ptarget)

if self.is_all_dates:
self = Index(self.asi8)
tgt_values = target.asi8
else:
tgt_values = target._ndarray_values
Expand Down
28 changes: 26 additions & 2 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pandas.util._decorators import Appender, Substitution, cache_readonly

from pandas.core.dtypes.common import (
ensure_platform_int,
is_bool_dtype,
is_datetime64_any_dtype,
is_float,
Expand Down Expand Up @@ -618,7 +619,7 @@ def get_value(self, series, key):
elif grp == freqn:
key = Period(asdt, freq=self.freq).ordinal
return com.maybe_box(
self, self._engine.get_value(s, key), series, key
self, self._int64index.get_value(s, key), series, key
)
else:
raise KeyError(key)
Expand All @@ -627,7 +628,7 @@ def get_value(self, series, key):

period = Period(key, self.freq)
key = period.value if isna(period) else period.ordinal
return com.maybe_box(self, self._engine.get_value(s, key), series, key)
return com.maybe_box(self, self._int64index.get_value(s, key), series, key)

@Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
Expand All @@ -648,6 +649,23 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
tolerance = self._convert_tolerance(tolerance, target)
return Index.get_indexer(self._int64index, target, method, limit, tolerance)

@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
target = ensure_index(target)

if isinstance(target, PeriodIndex):
target = target.asi8
if hasattr(target, "freq") and target.freq != self.freq:
msg = DIFFERENT_FREQ.format(
cls=type(self).__name__,
own_freq=self.freqstr,
other_freq=target.freqstr,
)
raise IncompatibleFrequency(msg)

indexer, missing = self._int64index.get_indexer_non_unique(target)
return ensure_platform_int(indexer), missing

def _get_unique_index(self, dropna=False):
"""
wrap Index._get_unique_index to handle NaT
Expand Down Expand Up @@ -954,6 +972,12 @@ def base(self):
)
return np.asarray(self._data)

def memory_usage(self, deep=False):
result = super().memory_usage(deep=deep)
if hasattr(self, "_cache") and "_int64index" in self._cache:
result += self._int64index.memory_usage(deep=deep)
return result


PeriodIndex._add_comparison_ops()
PeriodIndex._add_numeric_methods_disabled()
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/series/indexing/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,30 @@ def test_indexing_over_size_cutoff():
_index._SIZE_CUTOFF = old_cutoff


def test_indexing_over_size_cutoff_period_index():
# GH 27136

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment here (the title is pretty good though) with the issue number as a comment as well

old_cutoff = _index._SIZE_CUTOFF
try:
_index._SIZE_CUTOFF = 1000

n = 1100
idx = pd.period_range("1/1/2000", freq="T", periods=n)
assert idx._engine.over_size_threshold

s = pd.Series(np.random.randn(len(idx)), index=idx)

pos = n - 1
timestamp = idx[pos]
assert timestamp in s.index

# it works!
s[timestamp]
assert len(s.loc[[timestamp]]) > 0
finally:
_index._SIZE_CUTOFF = old_cutoff


def test_indexing_unordered():
# GH 2437
rng = date_range(start="2011-01-01", end="2011-01-15")
Expand Down