PERF, BENCH: Fix performance issue when indexing into non-unique DatetimeIndex/PeriodIndex.

qwhelan · qwhelan · commit 59167881e410 · 2019-06-30T21:22:25.000-07:00
Additionally, fix asv benchmark bugs that hid the issue and correct PeriodIndex.memory_usage() so that it includes the cached _int64index.
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -5,7 +5,8 @@
 from pandas import (Series, DataFrame, MultiIndex,
                     Int64Index, UInt64Index, Float64Index,
                     IntervalIndex, CategoricalIndex,
-                    IndexSlice, concat, date_range, option_context)
+                    IndexSlice, concat, date_range, option_context,
+                    period_range)
 
 
 class NumericSeriesIndexing:
@@ -82,20 +83,30 @@ def time_loc_slice(self, index, index_structure):
 class NonNumericSeriesIndexing:
 
     params = [
-        ('string', 'datetime'),
-        ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+        ('string', 'datetime', 'period'),
+        ('unique_monotonic_inc', 'nonunique_monotonic_inc', 'non_monotonic'),
     ]
     param_names = ['index_dtype', 'index_structure']
 
     def setup(self, index, index_structure):
         N = 10**6
-        indexes = {'string': tm.makeStringIndex(N),
-                   'datetime': date_range('1900', periods=N, freq='s')}
-        index = indexes[index]
+        if index == 'string':
+            index = tm.makeStringIndex(N)
+        elif index == 'datetime':
+            index = date_range('1900', periods=N, freq='s')
+        elif index == 'period':
+            index = period_range('1900', periods=N, freq='s')
+        index = index.sort_values()
+        assert index.is_unique and index.is_monotonic_increasing
         if index_structure == 'nonunique_monotonic_inc':
             index = index.insert(item=index[2], loc=2)[:-1]
+        elif index_structure == 'non_monotonic':
+            index = index[::2].append(index[1::2])
+            assert len(index) == N
         self.s = Series(np.random.rand(N), index=index)
         self.lbl = index[80000]
+        # warm up index mapping
+        self.s[self.lbl]
 
     def time_getitem_label_slice(self, index, index_structure):
         self.s[:self.lbl]
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -666,6 +666,7 @@ Performance improvements
 - Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
 - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
 - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
+- Improved performance of :class:`DatetimeIndex` and :class:`PeriodIndex` slicing given non-unique, monotonic data (:issue:`27136`)
 
 .. _whatsnew_0250.bug_fixes:
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4563,7 +4563,6 @@ def get_indexer_non_unique(self, target):
             return pself.get_indexer_non_unique(ptarget)
 
         if self.is_all_dates:
-            self = Index(self.asi8)
             tgt_values = target.asi8
         else:
             tgt_values = target._ndarray_values
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -11,8 +11,8 @@
 from pandas.util._decorators import Appender, Substitution, cache_readonly
 
 from pandas.core.dtypes.common import (
-    is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype,
-    is_integer, is_integer_dtype, pandas_dtype)
+    ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_float,
+    is_float_dtype, is_integer, is_integer_dtype, pandas_dtype)
 
 from pandas.core import common as com
 from pandas.core.accessor import delegate_names
@@ -600,7 +600,8 @@ def get_value(self, series, key):
                     return series[key]
                 elif grp == freqn:
                     key = Period(asdt, freq=self.freq).ordinal
-                    return com.maybe_box(self, self._engine.get_value(s, key),
+                    return com.maybe_box(self,
+                                         self._int64index.get_value(s, key),
                                          series, key)
                 else:
                     raise KeyError(key)
@@ -609,7 +610,7 @@ def get_value(self, series, key):
 
             period = Period(key, self.freq)
             key = period.value if isna(period) else period.ordinal
-            return com.maybe_box(self, self._engine.get_value(s, key),
+            return com.maybe_box(self, self._int64index.get_value(s, key),
                                  series, key)
 
     @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
@@ -630,6 +631,21 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         return Index.get_indexer(self._int64index, target, method,
                                  limit, tolerance)
 
+    @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+    def get_indexer_non_unique(self, target):
+        target = ensure_index(target)
+        # compare freq while target is still a PeriodIndex (before asi8)
+        if isinstance(target, PeriodIndex):
+            if target.freq != self.freq:
+                msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
+                                            own_freq=self.freqstr,
+                                            other_freq=target.freqstr)
+                raise IncompatibleFrequency(msg)
+            target = target.asi8
+
+        indexer, missing = self._int64index.get_indexer_non_unique(target)
+        return ensure_platform_int(indexer), missing
+
     def _get_unique_index(self, dropna=False):
         """
         wrap Index._get_unique_index to handle NaT
@@ -906,6 +922,12 @@ def base(self):
                       FutureWarning, stacklevel=2)
         return np.asarray(self._data)
 
+    def memory_usage(self, deep=False):
+        result = super().memory_usage(deep=deep)
+        if hasattr(self, '_cache') and '_int64index' in self._cache:
+            result += self._int64index.memory_usage(deep=deep)
+        return result
+
 
 PeriodIndex._add_comparison_ops()
 PeriodIndex._add_numeric_methods_disabled()
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
@@ -576,6 +576,30 @@ def test_indexing_over_size_cutoff():
         _index._SIZE_CUTOFF = old_cutoff
 
 
+def test_indexing_over_size_cutoff_period_index():
+
+    old_cutoff = _index._SIZE_CUTOFF
+    try:
+        _index._SIZE_CUTOFF = 1000
+
+        n = 1100
+        idx = pd.period_range('1/1/2000', freq='T', periods=n)
+        assert idx._engine.over_size_threshold
+
+        s = pd.Series(np.random.randn(len(idx)),
+                      index=idx)
+
+        pos = n - 1
+        period = idx[pos]
+        assert period in s.index
+
+        # scalar and list-like lookups must both succeed above the cutoff
+        s[period]
+        assert len(s.loc[[period]]) > 0
+    finally:
+        _index._SIZE_CUTOFF = old_cutoff
+
+
 def test_indexing_unordered():
     # GH 2437
     rng = date_range(start='2011-01-01', end='2011-01-15')