PERF, BENCH: Fix performance issue when indexing into non-unique DatetimeIndex/PeriodIndex. (#27136)

qwhelan · jreback · commit af5b2a257937 · 2019-07-06T18:11:47.000-04:00
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
+import warnings
 
 try:
     from pandas.api.types import union_categoricals
@@ -122,11 +123,16 @@ def setup(self):
         ncats = 100
 
         self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
-        self.s_str_cat = self.s_str.astype("category")
-        self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered()
+        self.s_str_cat = pd.Series(self.s_str, dtype="category")
+        with warnings.catch_warnings(record=True):
+            str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
+            self.s_str_cat_ordered = self.s_str.astype(str_cat_type)
+
         self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
-        self.s_int_cat = self.s_int.astype("category")
-        self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered()
+        self.s_int_cat = pd.Series(self.s_int, dtype="category")
+        with warnings.catch_warnings(record=True):
+            int_cat_type = pd.CategoricalDtype(set(self.s_int), ordered=True)
+            self.s_int_cat_ordered = self.s_int.astype(int_cat_type)
 
     def time_rank_string(self):
         self.s_str.rank()
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -15,6 +15,7 @@
     concat,
     date_range,
     option_context,
+    period_range,
 )
 
 
@@ -93,22 +94,30 @@ def time_loc_slice(self, index, index_structure):
 class NonNumericSeriesIndexing:
 
     params = [
-        ("string", "datetime"),
-        ("unique_monotonic_inc", "nonunique_monotonic_inc"),
+        ("string", "datetime", "period"),
+        ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"),
     ]
     param_names = ["index_dtype", "index_structure"]
 
     def setup(self, index, index_structure):
         N = 10 ** 6
-        indexes = {
-            "string": tm.makeStringIndex(N),
-            "datetime": date_range("1900", periods=N, freq="s"),
-        }
-        index = indexes[index]
+        if index == "string":
+            index = tm.makeStringIndex(N)
+        elif index == "datetime":
+            index = date_range("1900", periods=N, freq="s")
+        elif index == "period":
+            index = period_range("1900", periods=N, freq="s")
+        index = index.sort_values()
+        assert index.is_unique and index.is_monotonic_increasing
         if index_structure == "nonunique_monotonic_inc":
             index = index.insert(item=index[2], loc=2)[:-1]
+        elif index_structure == "non_monotonic":
+            index = index[::2].append(index[1::2])
+            assert len(index) == N
         self.s = Series(np.random.rand(N), index=index)
         self.lbl = index[80000]
+        # warm up index mapping
+        self.s[self.lbl]
 
     def time_getitem_label_slice(self, index, index_structure):
         self.s[: self.lbl]
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -938,6 +938,8 @@ Performance improvements
 - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
 - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
 - For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)
+- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`)
+
 
 .. _whatsnew_0250.bug_fixes:
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4792,7 +4792,6 @@ def get_indexer_non_unique(self, target):
             return pself.get_indexer_non_unique(ptarget)
 
         if self.is_all_dates:
-            self = Index(self.asi8)
             tgt_values = target.asi8
         else:
             tgt_values = target._ndarray_values
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -9,6 +9,7 @@
 from pandas.util._decorators import Appender, Substitution, cache_readonly
 
 from pandas.core.dtypes.common import (
+    ensure_platform_int,
     is_bool_dtype,
     is_datetime64_any_dtype,
     is_float,
@@ -618,7 +619,7 @@ def get_value(self, series, key):
                 elif grp == freqn:
                     key = Period(asdt, freq=self.freq).ordinal
                     return com.maybe_box(
-                        self, self._engine.get_value(s, key), series, key
+                        self, self._int64index.get_value(s, key), series, key
                     )
                 else:
                     raise KeyError(key)
@@ -627,7 +628,7 @@ def get_value(self, series, key):
 
             period = Period(key, self.freq)
             key = period.value if isna(period) else period.ordinal
-            return com.maybe_box(self, self._engine.get_value(s, key), series, key)
+            return com.maybe_box(self, self._int64index.get_value(s, key), series, key)
 
     @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
@@ -648,6 +649,23 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             tolerance = self._convert_tolerance(tolerance, target)
         return Index.get_indexer(self._int64index, target, method, limit, tolerance)
 
+    @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
+    def get_indexer_non_unique(self, target):
+        target = ensure_index(target)
+        # Validate freq while target is still a PeriodIndex (asi8 has no freq).
+        if isinstance(target, PeriodIndex):
+            if target.freq != self.freq:
+                msg = DIFFERENT_FREQ.format(
+                    cls=type(self).__name__,
+                    own_freq=self.freqstr,
+                    other_freq=target.freqstr,
+                )
+                raise IncompatibleFrequency(msg)
+            target = target.asi8
+        # Match on the int64 ordinals and return the indexer as platform ints.
+        indexer, missing = self._int64index.get_indexer_non_unique(target)
+        return ensure_platform_int(indexer), missing
+
     def _get_unique_index(self, dropna=False):
         """
         wrap Index._get_unique_index to handle NaT
@@ -954,6 +972,12 @@ def base(self):
         )
         return np.asarray(self._data)
 
+    def memory_usage(self, deep=False):
+        total = super().memory_usage(deep=deep)
+        if "_int64index" in getattr(self, "_cache", ()):  # count it only if cached
+            total += self._int64index.memory_usage(deep=deep)
+        return total
+
 
 PeriodIndex._add_comparison_ops()
 PeriodIndex._add_numeric_methods_disabled()
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
@@ -604,6 +604,30 @@ def test_indexing_over_size_cutoff():
         _index._SIZE_CUTOFF = old_cutoff
 
 
+def test_indexing_over_size_cutoff_period_index():
+    # GH 27136: label lookups on a PeriodIndex longer than the engine cutoff
+
+    saved_cutoff = _index._SIZE_CUTOFF
+    try:
+        _index._SIZE_CUTOFF = 1000
+
+        size = 1100
+        pidx = pd.period_range("1/1/2000", freq="T", periods=size)
+        assert pidx._engine.over_size_threshold
+
+        ser = pd.Series(np.random.randn(len(pidx)), index=pidx)
+
+        last = size - 1
+        period = pidx[last]
+        assert period in ser.index
+
+        # scalar lookup must not raise; list-like lookup must find the row
+        ser[period]
+        assert len(ser.loc[[period]]) > 0
+    finally:
+        _index._SIZE_CUTOFF = saved_cutoff
+
+
 def test_indexing_unordered():
     # GH 2437
     rng = date_range(start="2011-01-01", end="2011-01-15")