ENH: tolerance now takes list-like argument for reindex and get_indexer.

Brian Tu · Brian Tu · commit f21c7224d457 · 2017-08-28T21:35:13.000-04:00
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -81,6 +81,7 @@ Other Enhancements
 - :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
 - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`pd.read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
+- :func:`Series.reindex`, :func:`DataFrame.reindex`, and :func:`Index.get_indexer` now support a list-like argument for ``tolerance``.
 
 .. _whatsnew_0210.api_breaking:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2256,9 +2256,10 @@ def reindex_like(self, other, method=None, copy=True, limit=None,
             Maximum number of consecutive labels to fill for inexact matches.
         tolerance : optional
             Maximum distance between labels of the other object and this
-            object for inexact matches.
+            object for inexact matches. Can be list-like.
 
             .. versionadded:: 0.17.0
+            .. versionchanged:: 0.21.0 (list-like tolerance)
 
         Notes
         -----
@@ -2596,8 +2597,14 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
             Maximum distance between original and new labels for inexact
             matches. The values of the index at the matching locations most
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. A list-like tolerance (list, tuple, array or Series) must
+            be the same size as the index, and its dtype must exactly match
+            the index's type.
 
             .. versionadded:: 0.17.0
+            .. versionchanged:: 0.21.0 (list-like tolerance)
 
         Examples
         --------
@@ -2819,8 +2826,14 @@ def _reindex_multi(self, axes, copy, fill_value):
             Maximum distance between original and new labels for inexact
             matches. The values of the index at the matching locations most
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. A list-like tolerance (list, tuple, array or Series) must
+            be the same size as the index, and its dtype must exactly match
+            the index's type.
 
             .. versionadded:: 0.17.0
+            .. versionchanged:: 0.21.0 (list-like tolerance)
 
         Examples
         --------
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -85,7 +85,6 @@ class InvalidIndexError(Exception):
 _o_dtype = np.dtype(object)
 _Identity = object
 
-
 def _new_Index(cls, d):
     """ This is called upon unpickling, rather than the default which doesn't
     have arguments and breaks __new__
@@ -2436,9 +2435,14 @@ def _get_unique_index(self, dropna=False):
         tolerance : optional
             Maximum distance from index value for inexact matches. The value of
             the index at the matching location most satisfy the equation
-            ``abs(index[loc] - key) <= tolerance``.
+            ``abs(index[loc] - key) <= tolerance``. Tolerance may be a scalar
+            value, which applies the same tolerance to all values, or
+            list-like, which applies variable tolerance per element. List-like
+            includes list, tuple, array, Series, and must be the same size as
+            the index and its dtype must exactly match the index's type.
 
             .. versionadded:: 0.17.0
+            .. versionchanged:: 0.21.0 (list-like tolerance)
 
         Returns
         -------
@@ -2558,8 +2562,14 @@ def _get_level_values(self, level):
             Maximum distance between original and new labels for inexact
             matches. The values of the index at the matching locations most
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. A list-like tolerance (list, tuple, array or Series) must
+            be the same size as ``target``, and its dtype must exactly match
+            the index's type.
 
             .. versionadded:: 0.17.0
+            .. versionchanged:: 0.21.0 (list-like tolerance)
 
         Examples
         --------
@@ -2580,6 +2590,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         target = _ensure_index(target)
         if tolerance is not None:
             tolerance = self._convert_tolerance(tolerance)
+            if isinstance(tolerance, np.ndarray) and \
+               target.size != tolerance.size and tolerance.size > 1:
+                raise ValueError('ndarray tolerance size must match '
+                        'target index size')
 
         pself, ptarget = self._maybe_promote(target)
         if pself is not self or ptarget is not target:
@@ -2614,7 +2628,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
     def _convert_tolerance(self, tolerance):
         # override this method on subclasses
-        return tolerance
+        return _list_to_ndarray(tolerance)
 
     def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
         if self.is_monotonic_increasing and target.is_monotonic_increasing:
@@ -4008,6 +4022,16 @@ def invalid_op(self, other=None):
 Index._add_comparison_methods()
 
 
+def _list_to_ndarray(a):
+    """Convert list-like to np.ndarray, otherwise leave as-is.
+    Used for converting tolerance to ndarray in _convert_tolerance.
+    """
+    if isinstance(a, ABCSeries):
+        return a.values
+    elif isinstance(a, (list, tuple)):
+        return np.array(a)
+    return a
+
 def _ensure_index(index_like, copy=False):
     if isinstance(index_like, Index):
         if copy:
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -27,7 +27,8 @@
                           Timedelta, Timestamp, iNaT, NaT)
 from pandas._libs.period import Period
 
-from pandas.core.indexes.base import Index, _index_shared_docs
+from pandas.core.indexes.base import (Index, _index_shared_docs,
+        _list_to_ndarray)
 from pandas.util._decorators import Appender, cache_readonly
 import pandas.core.dtypes.concat as _concat
 import pandas.tseries.frequencies as frequencies
@@ -432,12 +433,35 @@ def asobject(self):
         return Index(self._box_values(self.asi8), name=self.name, dtype=object)
 
     def _convert_tolerance(self, tolerance):
-        try:
-            return Timedelta(tolerance).to_timedelta64()
-        except ValueError:
-            raise ValueError('tolerance argument for %s must be convertible '
-                             'to Timedelta: %r'
-                             % (type(self).__name__, tolerance))
+        tolerance = _list_to_ndarray(tolerance)
+        if isinstance(tolerance, np.ndarray):
+            if np.issubdtype(tolerance.dtype, np.timedelta64):
+                return tolerance
+            else:
+                try:
+                    tolerance = np.array([np.timedelta64(x)
+                        for x in tolerance])
+                    # in case user mixes something like seconds and Month
+                    if not np.issubdtype(tolerance.dtype, np.timedelta64):
+                        raise TypeError('All values in tolerance array must '
+                                'be convertible to [ns]')
+                except ValueError as e:
+                    raise TypeError(('tolerance argument for %s must contain '
+                        'objects convertible to np.timedelta64 '
+                        'if it is list type') %
+                        (type(self).__name__,)) from e
+                else:
+                    warnings.warn('Converting tolerance array to '
+                    'np.timedelta64 objects, consider doing preconverting '
+                    'for speed')
+                return tolerance
+        else:
+            try:
+                return Timedelta(tolerance).to_timedelta64()
+            except ValueError:
+                raise ValueError(('tolerance argument for %s must be '
+                    'convertible to Timedelta if it is a scalar: %r')
+                    % (type(self).__name__, tolerance))
 
     def _maybe_mask_results(self, result, fill_value=None, convert=None):
         """
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -1435,7 +1435,12 @@ def get_loc(self, key, method=None, tolerance=None):
             try:
                 stamp = Timestamp(key, tz=self.tz)
                 return Index.get_loc(self, stamp, method, tolerance)
-            except (KeyError, ValueError):
+            except KeyError:
+                raise KeyError(key)
+            except ValueError as e:
+                # ndarray tolerance size must match target index size
+                if 'ndarray' in str(e):
+                    raise e
                 raise KeyError(key)
 
     def _maybe_cast_slice_bound(self, label, side, kind):
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
@@ -15,7 +15,7 @@
 from pandas import compat
 from pandas.core import algorithms
 from pandas.core.indexes.base import (
-    Index, InvalidIndexError, _index_shared_docs)
+    Index, InvalidIndexError, _index_shared_docs, _list_to_ndarray)
 from pandas.util._decorators import Appender, cache_readonly
 import pandas.core.indexes.base as ibase
 
@@ -72,11 +72,20 @@ def _convert_for_op(self, value):
         return value
 
     def _convert_tolerance(self, tolerance):
-        try:
-            return float(tolerance)
-        except ValueError:
-            raise ValueError('tolerance argument for %s must be numeric: %r' %
-                             (type(self).__name__, tolerance))
+        tolerance = _list_to_ndarray(tolerance)
+        if isinstance(tolerance, np.ndarray):
+            if np.issubdtype(tolerance.dtype, np.number):
+                return tolerance
+            else:
+                raise ValueError(('tolerance argument for %s must contain '
+                'numeric elements if it is list type') % (type(self).__name__,))
+        else:
+            try:
+                return float(tolerance)
+            except ValueError:
+                raise ValueError(('tolerance argument for %s must be numeric '
+                    'if it is a scalar: %r') %
+                    (type(self).__name__, tolerance))
 
     @classmethod
     def _assert_safe_casting(cls, data, subarr):
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -633,12 +633,17 @@ def to_timestamp(self, freq=None, how='start'):
         return DatetimeIndex(new_data, freq='infer', name=self.name)
 
     def _maybe_convert_timedelta(self, other):
-        if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)):
+        if isinstance(other,
+                (timedelta, np.timedelta64, offsets.Tick, np.ndarray)):
             offset = frequencies.to_offset(self.freq.rule_code)
             if isinstance(offset, offsets.Tick):
                 nanos = tslib._delta_to_nanoseconds(other)
                 offset_nanos = tslib._delta_to_nanoseconds(offset)
-                if nanos % offset_nanos == 0:
+                if isinstance(other, np.ndarray):
+                    check = np.all(nanos % offset_nanos == 0)
+                else:
+                    check = nanos % offset_nanos == 0
+                if check:
                     return nanos // offset_nanos
         elif isinstance(other, offsets.DateOffset):
             freqstr = other.rule_code
@@ -775,6 +780,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         if tolerance is not None:
             tolerance = self._convert_tolerance(tolerance)
+            if isinstance(tolerance, np.ndarray) and \
+               target.size != tolerance.size and tolerance.size > 1:
+                raise ValueError('ndarray tolerance size must match '
+                        'target index size')
         return Index.get_indexer(self._int64index, target, method,
                                  limit, tolerance)
 
@@ -902,6 +911,10 @@ def _get_string_slice(self, key):
 
     def _convert_tolerance(self, tolerance):
         tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance)
+        if isinstance(tolerance, np.ndarray) \
+                and not np.issubdtype(tolerance.dtype, np.timedelta64):
+            raise TypeError('All values in tolerance array must be '
+                'convertible to [ns]')
         return self._maybe_convert_timedelta(tolerance)
 
     def insert(self, loc, item):
diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py
@@ -1905,9 +1905,13 @@ def test_reindex_methods(self):
 
             actual = df.reindex_like(df, method=method, tolerance=0)
             assert_frame_equal(df, actual)
+            actual = df.reindex_like(df, method=method, tolerance=[0,0,0,0])
+            assert_frame_equal(df, actual)
 
             actual = df.reindex(target, method=method, tolerance=1)
             assert_frame_equal(expected, actual)
+            actual = df.reindex(target, method=method, tolerance=[1,1,1,1])
+            assert_frame_equal(expected, actual)
 
             e2 = expected[::-1]
             actual = df.reindex(target[::-1], method=method)
@@ -1928,6 +1932,11 @@ def test_reindex_methods(self):
         actual = df.reindex(target, method='nearest', tolerance=0.2)
         assert_frame_equal(expected, actual)
 
+        expected = pd.DataFrame({'x': [0, np.nan, 1, np.nan]}, index=target)
+        actual = df.reindex(target, method='nearest',
+                tolerance=[0.5, 0.01, 0.4, 0.1])
+        assert_frame_equal(expected, actual)
+
     def test_reindex_frame_add_nat(self):
         rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s')
         df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng})
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -45,6 +45,11 @@ def test_get_loc(self):
             idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo')
         with pytest.raises(KeyError):
             idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours')
+        with pytest.raises(ValueError,
+                match='tolerance size must match target index size'):
+            idx.get_loc('2000-01-01', method='nearest',
+                    tolerance=[pd.Timedelta('1day').to_timedelta64(),
+                        pd.Timedelta('1day').to_timedelta64()])
 
         assert idx.get_loc('2000', method='nearest') == slice(0, 3)
         assert idx.get_loc('2000-01', method='nearest') == slice(0, 3)
@@ -93,6 +98,30 @@ def test_get_indexer(self):
             idx.get_indexer(target, 'nearest',
                             tolerance=pd.Timedelta('1 hour')),
             np.array([0, -1, 1], dtype=np.intp))
+        tol_raw = [pd.Timedelta('1 hour'),
+                   pd.Timedelta('1 hour'),
+                   pd.Timedelta('1 hour').to_timedelta64(), ]
+        with pytest.warns(UserWarning) as speedwarning:
+            tm.assert_numpy_array_equal(
+                idx.get_indexer(target, 'nearest',
+                                tolerance=tol_raw),
+                np.array([0, -1, 1], dtype=np.intp))
+        assert len(speedwarning) == 1
+        assert speedwarning[0].message.args[0]\
+            .endswith('preconverting for speed')
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, 'nearest',
+                            tolerance=[np.timedelta64(x) for x in tol_raw]),
+            np.array([0, -1, 1], dtype=np.intp))
+        with pytest.raises(TypeError, match=('must contain objects '
+            'convertible to np.timedelta64')):
+            idx.get_indexer(target, 'nearest', tolerance=[1,2,3])
+        tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
+                   pd.Timedelta('1 hour').to_timedelta64(),
+                   np.timedelta64(1, 'M'), ]
+        with pytest.raises(TypeError, match=('All values.*'
+            'convertible to \\[ns\\]')), pytest.warns(UserWarning):
+            idx.get_indexer(target, 'nearest', tolerance=tol_bad)
         with pytest.raises(ValueError):
             idx.get_indexer(idx[[0]], method='nearest', tolerance='foo')
 
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
@@ -89,6 +89,11 @@ def test_get_loc(self):
             idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour')
         with pytest.raises(KeyError):
             idx.get_loc('2000-01-10', method='nearest', tolerance='1 day')
+        with pytest.raises(ValueError, match=('ndarray tolerance size must '
+            'match target index size')):
+            idx.get_loc('2000-01-10', method='nearest',
+                    tolerance=[pd.Timedelta('1 day').to_timedelta64(),
+                        pd.Timedelta('1 day').to_timedelta64()])
 
     def test_where(self):
         i = self.create_index()
@@ -156,6 +161,30 @@ def test_get_indexer(self):
         tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest',
                                                     tolerance='1 day'),
                                     np.array([0, 1, 1], dtype=np.intp))
+        tol_raw = [pd.Timedelta('1 hour'),
+                   pd.Timedelta('1 hour'),
+                   np.timedelta64(1, 'D'), ]
+        with pytest.warns(UserWarning) as speedwarning:
+            tm.assert_numpy_array_equal(
+                idx.get_indexer(target, 'nearest',
+                                tolerance=tol_raw),
+                np.array([0, -1, 1], dtype=np.intp))
+        assert len(speedwarning) == 1
+        assert speedwarning[0].message.args[0]\
+            .endswith('preconverting for speed')
+        tm.assert_numpy_array_equal(
+            idx.get_indexer(target, 'nearest',
+                            tolerance=[np.timedelta64(x) for x in tol_raw]),
+            np.array([0, -1, 1], dtype=np.intp))
+        tol_bad = [pd.Timedelta('2 hour').to_timedelta64(),
+                   pd.Timedelta('1 hour').to_timedelta64(),
+                   np.timedelta64(1, 'M'), ]
+        with pytest.raises(TypeError, match=('All values.*'
+            'convertible to \\[ns\\]')), pytest.warns(UserWarning):
+            idx.get_indexer(target, 'nearest', tolerance=tol_bad)
+        with pytest.raises(TypeError, match=('must contain objects '
+            'convertible to np.timedelta64')):
+            idx.get_indexer(target, 'nearest', tolerance=[1,2,3])
 
     def test_repeat(self):
         # GH10183
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py