From b735ffc75e4cea35b14866ada4c58a42551fb9dc Mon Sep 17 00:00:00 2001 From: immerrr Date: Thu, 6 Nov 2014 21:59:54 +0300 Subject: [PATCH] BUG: fix negative step support for label-based slices INT: make Index.slice_locs step-aware BUG: fix PeriodIndex.searchsorted to accept Periods INT: refactor time-related indices to use step-aware slice_locs INT: refactor MultiIndex to use step-aware slice_locs INT: enable second/microsecond partial string slicing --- doc/source/whatsnew/v0.15.2.txt | 23 +++ pandas/core/index.py | 202 ++++++++++++++++-------- pandas/tests/test_index.py | 30 +++- pandas/tests/test_indexing.py | 58 +++++++ pandas/tseries/base.py | 46 ------ pandas/tseries/index.py | 157 ++++++++++++++---- pandas/tseries/period.py | 97 ++++++------ pandas/tseries/tdi.py | 26 +++ pandas/tseries/tests/test_period.py | 44 +++++- pandas/tseries/tests/test_timedeltas.py | 34 ++++ pandas/tseries/tests/test_timeseries.py | 70 +++++++- 11 files changed, 594 insertions(+), 193 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 61d18da45e5f0..eb2446d2a50d3 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -70,7 +70,30 @@ Bug Fixes - ``Timedelta`` kwargs may now be numpy ints and floats (:issue:`8757`). - ``sql_schema`` now generates dialect appropriate ``CREATE TABLE`` statements (:issue:`8697`) - ``slice`` string method now takes step into account (:issue:`8754`) +- Fix negative step support for label-based slices (:issue:`8753`) + Old behavior: + + .. code-block:: python + + In [1]: s = pd.Series(np.arange(3), ['a', 'b', 'c']) + Out[1]: + a 0 + b 1 + c 2 + dtype: int64 + + In [2]: s.loc['c':'a':-1] + Out[2]: + c 2 + dtype: int64 + + New behavior: + + .. ipython:: python + + s = pd.Series(np.arange(3), ['a', 'b', 'c']) + s.loc['c':'a':-1] diff --git a/pandas/core/index.py b/pandas/core/index.py index 6702a21167850..3f0b45ae10988 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1959,23 +1959,99 @@ def slice_indexer(self, start=None, end=None, step=None): ----- This function assumes that the data is sorted, so use at your own peril """ - start_slice, end_slice = self.slice_locs(start, end) + start_slice, end_slice = self.slice_locs(start, end, step=step) # return a slice - if np.isscalar(start_slice) and np.isscalar(end_slice): + if not lib.isscalar(start_slice): + raise AssertionError("Start slice bound is non-scalar") + if not lib.isscalar(end_slice): + raise AssertionError("End slice bound is non-scalar") - # degenerate cases - if start is None and end is None: - return slice(None, None, step) + return slice(start_slice, end_slice, step) - return slice(start_slice, end_slice, step) + def _maybe_cast_slice_bound(self, label, side): + """ + This function should be overloaded in subclasses that allow non-trivial + casting on label-slice bounds, e.g. datetime-like indices allowing + strings containing formatted datetimes. - # loc indexers - return (Index(start_slice) & Index(end_slice)).values + Parameters + ---------- + label : object + side : {'left', 'right'} + + Notes + ----- + Value of `side` parameter should be validated in caller. - def slice_locs(self, start=None, end=None): """ - For an ordered Index, compute the slice locations for input labels + return label + + def get_slice_bound(self, label, side): + """ + Calculate slice bound that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if ``side=='right'``) position + of given label. + + Parameters + ---------- + label : object + side : {'left', 'right'} + + """ + if side not in ('left', 'right'): + raise ValueError( + "Invalid value for side kwarg," + " must be either 'left' or 'right': %s" % (side,)) + + original_label = label + # For datetime indices label may be a string that has to be converted + # to datetime boundary according to its resolution. + label = self._maybe_cast_slice_bound(label, side) + + try: + slc = self.get_loc(label) + except KeyError: + if self.is_monotonic_increasing: + return self.searchsorted(label, side=side) + elif self.is_monotonic_decreasing: + # np.searchsorted expects ascending sort order, have to reverse + # everything for it to work (element ordering, search side and + # resulting value). + pos = self[::-1].searchsorted( + label, side='right' if side == 'left' else 'right') + return len(self) - pos + + # In all other cases, just re-raise the KeyError + raise + + if isinstance(slc, np.ndarray): + # get_loc may return a boolean array or an array of indices, which + # is OK as long as they are representable by a slice. + if com.is_bool_dtype(slc): + slc = lib.maybe_booleans_to_slice(slc.view('u1')) + else: + slc = lib.maybe_indices_to_slice(slc.astype('i8')) + if isinstance(slc, np.ndarray): + raise KeyError( + "Cannot get %s slice bound for non-unique label:" + " %r" % (side, original_label)) + + if isinstance(slc, slice): + if side == 'left': + return slc.start + else: + return slc.stop + else: + if side == 'right': + return slc + 1 + else: + return slc + + def slice_locs(self, start=None, end=None, step=None): + """ + Compute slice locations for input labels. Parameters ---------- @@ -1986,51 +2062,51 @@ def slice_locs(self, start=None, end=None): Returns ------- - (start, end) : (int, int) + start, end : int - Notes - ----- - This function assumes that the data is sorted, so use at your own peril """ + inc = (step is None or step >= 0) - is_unique = self.is_unique - - def _get_slice(starting_value, offset, search_side, slice_property, - search_value): - if search_value is None: - return starting_value + if not inc: + # If it's a reverse slice, temporarily swap bounds. + start, end = end, start - try: - slc = self.get_loc(search_value) - - if not is_unique: - - # get_loc will return a boolean array for non_uniques - # if we are not monotonic - if isinstance(slc, (np.ndarray, Index)): - raise KeyError("cannot peform a slice operation " - "on a non-unique non-monotonic index") - - if isinstance(slc, slice): - slc = getattr(slc, slice_property) - else: - slc += offset + start_slice = None + if start is not None: + start_slice = self.get_slice_bound(start, 'left') + if start_slice is None: + start_slice = 0 - except KeyError: - if self.is_monotonic_increasing: - slc = self.searchsorted(search_value, side=search_side) - elif self.is_monotonic_decreasing: - search_side = 'right' if search_side == 'left' else 'left' - slc = len(self) - self[::-1].searchsorted(search_value, - side=search_side) - else: - raise - return slc + end_slice = None + if end is not None: + end_slice = self.get_slice_bound(end, 'right') + if end_slice is None: + end_slice = len(self) - start_slice = _get_slice(0, offset=0, search_side='left', - slice_property='start', search_value=start) - end_slice = _get_slice(len(self), offset=1, search_side='right', - slice_property='stop', search_value=end) + if not inc: + # Bounds at this moment are swapped, swap them back and shift by 1. + # + # slice_locs('B', 'A', step=-1): s='B', e='A' + # + # s='A' e='B' + # AFTER SWAP: | | + # v ------------------> V + # ----------------------------------- + # | | |A|A|A|A| | | | | |B|B| | | | | + # ----------------------------------- + # ^ <------------------ ^ + # SHOULD BE: | | + # end=s-1 start=e-1 + # + end_slice, start_slice = start_slice - 1, end_slice - 1 + + # i == -1 triggers ``len(self) + i`` selection that points to the + # last element, not before-the-first one, subtracting len(self) + # compensates that. + if end_slice == -1: + end_slice -= len(self) + if start_slice == -1: + start_slice -= len(self) return start_slice, end_slice @@ -3887,7 +3963,12 @@ def _tuple_index(self): """ return Index(self.values) - def slice_locs(self, start=None, end=None, strict=False): + def get_slice_bound(self, label, side): + if not isinstance(label, tuple): + label = label, + return self._partial_tup_index(label, side=side) + + def slice_locs(self, start=None, end=None, step=None): """ For an ordered MultiIndex, compute the slice locations for input labels. They can be tuples representing partial levels, e.g. for a @@ -3900,7 +3981,8 @@ def slice_locs(self, start=None, end=None, strict=False): If None, defaults to the beginning end : label or tuple If None, defaults to the end - strict : boolean, + step : int or None + Slice step Returns ------- @@ -3910,21 +3992,9 @@ def slice_locs(self, start=None, end=None, strict=False): ----- This function assumes that the data is sorted by the first level """ - if start is None: - start_slice = 0 - else: - if not isinstance(start, tuple): - start = start, - start_slice = self._partial_tup_index(start, side='left') - - if end is None: - end_slice = len(self) - else: - if not isinstance(end, tuple): - end = end, - end_slice = self._partial_tup_index(end, side='right') - - return start_slice, end_slice + # This function adds nothing to its parent implementation (the magic + # happens in get_slice_bound method), but it adds meaningful doc. + return super(MultiIndex, self).slice_locs(start, end, step) def _partial_tup_index(self, tup, side='left'): if len(tup) > self.lexsort_depth: diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index cca8324b42b93..b7a18da3924c8 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -910,8 +910,34 @@ def test_slice_locs_na(self): self.assertEqual(idx.slice_locs(1), (1, 3)) self.assertEqual(idx.slice_locs(np.nan), (0, 3)) - idx = Index([np.nan, np.nan, 1, 2]) - self.assertRaises(KeyError, idx.slice_locs, np.nan) + idx = Index([0, np.nan, np.nan, 1, 2]) + self.assertEqual(idx.slice_locs(np.nan), (1, 5)) + + def test_slice_locs_negative_step(self): + idx = Index(list('bcdxy')) + + SLC = pd.IndexSlice + + def check_slice(in_slice, expected): + s_start, s_stop = idx.slice_locs(in_slice.start, in_slice.stop, + in_slice.step) + result = idx[s_start:s_stop:in_slice.step] + expected = pd.Index(list(expected)) + self.assertTrue(result.equals(expected)) + + for in_slice, expected in [ + (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''), + (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'), + (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'), + (SLC['y'::-4], 'yb'), + # absent labels + (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'), + (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'), + (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'), + (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''), + (SLC['m':'m':-1], '') + ]: + check_slice(in_slice, expected) def test_drop(self): n = len(self.strIndex) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 66307d58b2f27..76be2e64de8d0 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -4141,6 +4141,64 @@ def run_tests(df, rhs, right): run_tests(df, rhs, right) + def test_str_label_slicing_with_negative_step(self): + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + + if not idx.is_integer: + # For integer indices, ix and plain getitem are position-based. + assert_series_equal(s[l_slc], s.iloc[i_slc]) + assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + + for idx in [_mklbl('A', 20), np.arange(20) + 100, + np.linspace(100, 150, 20)]: + idx = Index(idx) + s = Series(np.arange(20), index=idx) + assert_slices_equivalent(SLC[idx[9]::-1], SLC[9::-1]) + assert_slices_equivalent(SLC[:idx[9]:-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[idx[13]:idx[9]:-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[idx[9]:idx[13]:-1], SLC[:0]) + + def test_multiindex_label_slicing_with_negative_step(self): + s = Series(np.arange(20), + MultiIndex.from_product([list('abcde'), np.arange(4)])) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + assert_series_equal(s[l_slc], s.iloc[i_slc]) + assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + + assert_slices_equivalent(SLC[::-1], SLC[::-1]) + + assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) + assert_slices_equivalent(SLC[('d',)::-1], SLC[15::-1]) + + assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:('d',):-1], SLC[:11:-1]) + + assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[('d',):'b':-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC['d':('b',):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[('d',):('b',):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) + + assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) + assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) + assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) + + def test_slice_with_zero_step_raises(self): + s = Series(np.arange(20), index=_mklbl('A', 20)) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: s[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: s.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: s.ix[::0]) + + class TestSeriesNoneCoercion(tm.TestCase): EXPECTED_RESULTS = [ # For numeric series, we should coerce to NaN. diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 0a446919e95d2..d47544149c381 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -114,52 +114,6 @@ def take(self, indices, axis=0): return self[maybe_slice] return super(DatetimeIndexOpsMixin, self).take(indices, axis) - def slice_locs(self, start=None, end=None): - """ - Index.slice_locs, customized to handle partial ISO-8601 string slicing - """ - if isinstance(start, compat.string_types) or isinstance(end, compat.string_types): - - if self.is_monotonic: - try: - if start: - start_loc = self._get_string_slice(start).start - else: - start_loc = 0 - - if end: - end_loc = self._get_string_slice(end).stop - else: - end_loc = len(self) - - return start_loc, end_loc - except KeyError: - pass - - else: - # can't use a slice indexer because we are not sorted! - # so create an indexer directly - try: - if start: - start_loc = self._get_string_slice(start, - use_rhs=False) - else: - start_loc = np.arange(len(self)) - - if end: - end_loc = self._get_string_slice(end, use_lhs=False) - else: - end_loc = np.arange(len(self)) - - return start_loc, end_loc - except KeyError: - pass - - if isinstance(start, time) or isinstance(end, time): - raise KeyError('Cannot use slice_locs with time slice keys') - - return Index.slice_locs(self, start, end) - def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index bf99de902188f..202e30cc2eb5e 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1092,51 +1092,83 @@ def intersection(self, other): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) - def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): + def _parsed_string_to_bounds(self, reso, parsed): + """ + Calculate datetime bounds for parsed time string and its resolution. - is_monotonic = self.is_monotonic + Parameters + ---------- + reso : Resolution + Resolution provided by parsed string. + parsed : datetime + Datetime from parsed string. + Returns + ------- + lower, upper: pd.Timestamp + + """ + is_monotonic = self.is_monotonic if reso == 'year': - t1 = Timestamp(datetime(parsed.year, 1, 1), tz=self.tz) - t2 = Timestamp(datetime(parsed.year, 12, 31, 23, 59, 59, 999999), tz=self.tz) + return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz), + Timestamp(datetime(parsed.year, 12, 31, 23, 59, 59, 999999), tz=self.tz)) elif reso == 'month': d = tslib.monthrange(parsed.year, parsed.month)[1] - t1 = Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz) - t2 = Timestamp(datetime(parsed.year, parsed.month, d, 23, 59, 59, 999999), tz=self.tz) + return (Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz), + Timestamp(datetime(parsed.year, parsed.month, d, 23, 59, 59, 999999), tz=self.tz)) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = tslib.monthrange(parsed.year, qe)[1] # at end of month - t1 = Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz) - t2 = Timestamp(datetime(parsed.year, qe, d, 23, 59, 59, 999999), tz=self.tz) - elif (reso == 'day' and (self._resolution < Resolution.RESO_DAY or not is_monotonic)): + return (Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz), + Timestamp(datetime(parsed.year, qe, d, 23, 59, 59, 999999), tz=self.tz)) + elif reso == 'day': st = datetime(parsed.year, parsed.month, parsed.day) - t1 = Timestamp(st, tz=self.tz) - t2 = st + offsets.Day() - t2 = Timestamp(Timestamp(t2, tz=self.tz).value - 1) - elif (reso == 'hour' and ( - self._resolution < Resolution.RESO_HR or not is_monotonic)): + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Day(), tz=self.tz).value - 1)) + elif reso == 'hour': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour) - t1 = Timestamp(st, tz=self.tz) - t2 = Timestamp(Timestamp(st + offsets.Hour(), - tz=self.tz).value - 1) - elif (reso == 'minute' and ( - self._resolution < Resolution.RESO_MIN or not is_monotonic)): + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Hour(), + tz=self.tz).value - 1)) + elif reso == 'minute': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute) - t1 = Timestamp(st, tz=self.tz) - t2 = Timestamp(Timestamp(st + offsets.Minute(), - tz=self.tz).value - 1) - elif (reso == 'second' and ( - self._resolution == Resolution.RESO_SEC or not is_monotonic)): + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Minute(), + tz=self.tz).value - 1)) + elif reso == 'second': st = datetime(parsed.year, parsed.month, parsed.day, hour=parsed.hour, minute=parsed.minute, second=parsed.second) - t1 = Timestamp(st, tz=self.tz) - t2 = Timestamp(Timestamp(st + offsets.Second(), - tz=self.tz).value - 1) + return (Timestamp(st, tz=self.tz), + Timestamp(Timestamp(st + offsets.Second(), + tz=self.tz).value - 1)) + elif reso == 'microsecond': + st = datetime(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute, parsed.second, + parsed.microsecond) + return (Timestamp(st, tz=self.tz), Timestamp(st, tz=self.tz)) else: raise KeyError + def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): + is_monotonic = self.is_monotonic + if ((reso in ['day', 'hour', 'minute'] and + not (self._resolution < Resolution.get_reso(reso) or + not is_monotonic)) or + (reso == 'second' and + not (self._resolution <= Resolution.RESO_SEC or + not is_monotonic))): + # These resolution/monotonicity validations came from GH3931, + # GH3452 and GH2369. + raise KeyError + + if reso == 'microsecond': + # _partial_date_slice doesn't allow microsecond resolution, but + # _parsed_string_to_bounds allows it. + raise KeyError + + t1, t2 = self._parsed_string_to_bounds(reso, parsed) stamps = self.asi8 if is_monotonic: @@ -1235,6 +1267,34 @@ def get_loc(self, key): except (KeyError, ValueError): raise KeyError(key) + def _maybe_cast_slice_bound(self, label, side): + """ + If label is a string, cast it to datetime according to resolution. + + Parameters + ---------- + label : object + side : {'left', 'right'} + + Notes + ----- + Value of `side` parameter should be validated in caller. + + """ + if isinstance(label, float): + raise TypeError('Cannot index datetime64 with float keys') + if isinstance(label, time): + raise KeyError('Cannot index datetime64 with time keys') + + if isinstance(label, compat.string_types): + freq = getattr(self, 'freqstr', + getattr(self, 'inferred_freq', None)) + _, parsed, reso = parse_time_string(label, freq) + bounds = self._parsed_string_to_bounds(reso, parsed) + return bounds[0 if side == 'left' else 1] + else: + return label + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) @@ -1245,8 +1305,21 @@ def _get_string_slice(self, key, use_lhs=True, use_rhs=True): def slice_indexer(self, start=None, end=None, step=None): """ - Index.slice_indexer, customized to handle time slicing + Return indexer for specified label slice. + Index.slice_indexer, customized to handle time slicing. + + In addition to functionality provided by Index.slice_indexer, does the + following: + + - if both `start` and `end` are instances of `datetime.time`, it + invokes `indexer_between_time` + - if `start` and `end` are both either string or None perform + value-based selection in non-monotonic cases. + """ + # For historical reasons DatetimeIndex supports slices between two + # instances of datetime.time as if it were applying a slice mask to + # an array of (self.hour, self.minute, self.seconds, self.microsecond). if isinstance(start, time) and isinstance(end, time): if step is not None and step != 1: raise ValueError('Must have step size of 1 with time slices') @@ -1255,10 +1328,30 @@ def slice_indexer(self, start=None, end=None, step=None): if isinstance(start, time) or isinstance(end, time): raise KeyError('Cannot mix time and non-time slice keys') - if isinstance(start, float) or isinstance(end, float): - raise TypeError('Cannot index datetime64 with float keys') - - return Index.slice_indexer(self, start, end, step) + try: + return Index.slice_indexer(self, start, end, step) + except KeyError: + # For historical reasons DatetimeIndex by default supports + # value-based partial (aka string) slices on non-monotonic arrays, + # let's try that. + if ((start is None or isinstance(start, compat.string_types)) and + (end is None or isinstance(end, compat.string_types))): + mask = True + if start is not None: + start_casted = self._maybe_cast_slice_bound(start, 'left') + mask = start_casted <= self + + if end is not None: + end_casted = self._maybe_cast_slice_bound(end, 'right') + mask = (self <= end_casted) & mask + + indexer = mask.nonzero()[0][::step] + if len(indexer) == len(self): + return slice(None) + else: + return indexer + else: + raise def __getitem__(self, key): getitem = self._data.__getitem__ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 0b4ca5014e76b..fbea7a3e1af67 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -783,7 +783,11 @@ def astype(self, dtype): raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) def searchsorted(self, key, side='left'): - if isinstance(key, compat.string_types): + if isinstance(key, Period): + if key.freq != self.freq: + raise ValueError("Different period frequency: %s" % key.freq) + key = key.ordinal + elif isinstance(key, compat.string_types): key = Period(key, freq=self.freq).ordinal return self.values.searchsorted(key, side=side) @@ -982,6 +986,9 @@ def get_loc(self, key): try: return self._engine.get_loc(key) except KeyError: + if com.is_integer(key): + raise + try: asdt, parsed, reso = parse_time_string(key, self.freq) key = asdt @@ -994,47 +1001,38 @@ def get_loc(self, key): except KeyError: raise KeyError(key) - def slice_locs(self, start=None, end=None): - """ - Index.slice_locs, customized to handle partial ISO-8601 string slicing + def _maybe_cast_slice_bound(self, label, side): """ - if isinstance(start, compat.string_types) or isinstance(end, compat.string_types): - try: - if start: - start_loc = self._get_string_slice(start).start - else: - start_loc = 0 - - if end: - end_loc = self._get_string_slice(end).stop - else: - end_loc = len(self) - - return start_loc, end_loc - except KeyError: - pass - - if isinstance(start, datetime) and isinstance(end, datetime): - ordinals = self.values - t1 = Period(start, freq=self.freq) - t2 = Period(end, freq=self.freq) + If label is a string or a datetime, cast it to Period.ordinal according to + resolution. - left = ordinals.searchsorted(t1.ordinal, side='left') - right = ordinals.searchsorted(t2.ordinal, side='right') - return left, right + Parameters + ---------- + label : object + side : {'left', 'right'} - return Int64Index.slice_locs(self, start, end) + Returns + ------- + bound : Period or object - def _get_string_slice(self, key): - if not self.is_monotonic: - raise ValueError('Partial indexing only valid for ' - 'ordered time series') + Notes + ----- + Value of `side` parameter should be validated in caller. - key, parsed, reso = parse_time_string(key, self.freq) + """ + if isinstance(label, datetime): + return Period(label, freq=self.freq) + elif isinstance(label, compat.string_types): + try: + _, parsed, reso = parse_time_string(label, self.freq) + bounds = self._parsed_string_to_bounds(reso, parsed) + return bounds[0 if side == 'left' else 1] + except Exception: + raise KeyError(label) - grp = frequencies._infer_period_group(reso) - freqn = frequencies._period_group(self.freq) + return label + def _parsed_string_to_bounds(self, reso, parsed): if reso == 'year': t1 = Period(year=parsed.year, freq='A') elif reso == 'month': @@ -1042,30 +1040,39 @@ def _get_string_slice(self, key): elif reso == 'quarter': q = (parsed.month - 1) // 3 + 1 t1 = Period(year=parsed.year, quarter=q, freq='Q-DEC') - elif reso == 'day' and grp < freqn: + elif reso == 'day': t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, freq='D') - elif reso == 'hour' and grp < freqn: + elif reso == 'hour': t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, hour=parsed.hour, freq='H') - elif reso == 'minute' and grp < freqn: + elif reso == 'minute': t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, hour=parsed.hour, minute=parsed.minute, freq='T') - elif reso == 'second' and grp < freqn: + elif reso == 'second': t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, hour=parsed.hour, minute=parsed.minute, second=parsed.second, freq='S') else: raise KeyError(key) + return (t1.asfreq(self.freq, how='start'), + t1.asfreq(self.freq, how='end')) + + def _get_string_slice(self, key): + if not self.is_monotonic: + raise ValueError('Partial indexing only valid for ' + 'ordered time series') - ordinals = self.values + key, parsed, reso = parse_time_string(key, self.freq) - t2 = t1.asfreq(self.freq, how='end') - t1 = t1.asfreq(self.freq, how='start') + grp = frequencies._infer_period_group(reso) + freqn = frequencies._period_group(self.freq) + if reso in ['day', 'hour', 'minute', 'second'] and not grp < freqn: + raise KeyError(key) - left = ordinals.searchsorted(t1.ordinal, side='left') - right = ordinals.searchsorted(t2.ordinal, side='right') - return slice(left, right) + t1, t2 = self._parsed_string_to_bounds(reso, parsed) + return slice(self.searchsorted(t1.ordinal, side='left'), + self.searchsorted(t2.ordinal, side='right')) def join(self, other, how='left', level=None, return_indexers=False): """ diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 0d99cd16d8c99..7fb897aecc809 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -76,6 +76,7 @@ def wrapper(self, other): return wrapper + class TimedeltaIndex(DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of timedelta64 data, represented internally as int64, and @@ -705,6 +706,31 @@ def get_loc(self, key): except (KeyError, ValueError): raise KeyError(key) + def _maybe_cast_slice_bound(self, label, side): + """ + If label is a string, cast it to timedelta according to resolution. + + + Parameters + ---------- + label : object + side : {'left', 'right'} + + Returns + ------- + bound : Timedelta or object + + """ + if isinstance(label, compat.string_types): + parsed = _coerce_scalar_to_timedelta_type(label, box=True) + lbound = parsed.round(parsed.resolution) + if side == 'left': + return lbound + else: + return (lbound + _resolution_map[parsed.resolution]() - + Timedelta(1, 'ns')) + return label + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index e046d687435e7..1fd2d7b8fa8e5 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1352,7 +1352,9 @@ def test_getitem_partial(self): assert_series_equal(exp, result) ts = ts[10:].append(ts[10:]) - self.assertRaises(ValueError, ts.__getitem__, slice('2008', '2009')) + self.assertRaisesRegexp( + KeyError, "left slice bound for non-unique label: '2008'", + ts.__getitem__, slice('2008', '2009')) def test_getitem_datetime(self): rng = period_range(start='2012-01-01', periods=10, freq='W-MON') @@ -1364,6 +1366,39 @@ def test_getitem_datetime(self): rs = ts[dt1:dt4] assert_series_equal(rs, ts) + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), + period_range('2014-01', periods=20, freq='M')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) + assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[:Period('2014-10'):-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:'2014-10':-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC['2015-02':'2014-10':-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period('2015-02'):Period('2014-10'):-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC['2015-02':Period('2014-10'):-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period('2015-02'):'2014-10':-1], SLC[13:8:-1]) + + assert_slices_equivalent(SLC['2014-10':'2015-02':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), + period_range('2014-01', periods=20, freq='M')) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.ix[::0]) + def test_sub(self): rng = period_range('2007-01', periods=50) @@ -2464,6 +2499,13 @@ def test_combine_first(self): expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx, dtype=np.float64) tm.assert_series_equal(result, expected) + def test_searchsorted(self): + pidx = pd.period_range('2014-01-01', periods=10, freq='D') + self.assertEqual( + pidx.searchsorted(pd.Period('2014-01-01', freq='D')), 0) + self.assertRaisesRegexp( + ValueError, 'Different period frequency: H', + lambda: pidx.searchsorted(pd.Period('2014-01-01', freq='H'))) def _permute(obj): return obj.take(np.random.permutation(len(obj))) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index c8dd5573370d9..9ad2a090ee0cf 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1232,6 +1232,40 @@ def test_partial_slice_high_reso(self): result = s['1 days, 10:11:12.001001'] self.assertEqual(result, s.irow(1001)) + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), + timedelta_range('0', periods=20, freq='H')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1]) + assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1]) + + assert_slices_equivalent(SLC[:Timedelta(hours=7):-1], SLC[:6:-1]) + assert_slices_equivalent(SLC[:'7 hours':-1], SLC[:6:-1]) + + assert_slices_equivalent(SLC['15 hours':'7 hours':-1], SLC[15:6:-1]) + assert_slices_equivalent(SLC[Timedelta(hours=15):Timedelta(hours=7):-1], SLC[15:6:-1]) + assert_slices_equivalent(SLC['15 hours':Timedelta(hours=7):-1], SLC[15:6:-1]) + assert_slices_equivalent(SLC[Timedelta(hours=15):'7 hours':-1], SLC[15:6:-1]) + + assert_slices_equivalent(SLC['7 hours':'15 hours':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), + timedelta_range('0', periods=20, freq='H')) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.ix[::0]) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index e6b4bf23e806f..436f9f3b9c9b3 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -198,7 +198,6 @@ def test_indexing_over_size_cutoff(self): _index._SIZE_CUTOFF = old_cutoff def test_indexing_unordered(self): - # GH 2437 rng = date_range(start='2011-01-01', end='2011-01-15') ts = Series(randn(len(rng)), index=rng) @@ -2767,6 +2766,41 @@ def test_factorize(self): self.assertTrue(idx.equals(idx3)) + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), + date_range('2014-01-01', periods=20, freq='MS')) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) + assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp('2014-10-01'):-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], SLC[13:8:-1]) + + assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), + date_range('2014-01-01', periods=20, freq='MS')) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.loc[::0]) + self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', + lambda: ts.ix[::0]) + + + class TestDatetime64(tm.TestCase): """ Also test support for datetime64[ns] in Series / DataFrame @@ -3745,6 +3779,22 @@ def test_partial_slice_minutely(self): self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.ix[0]) self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') + def test_partial_slice_second_precision(self): + rng = DatetimeIndex(start=datetime(2005, 1, 1, 0, 0, 59, + microsecond=999990), + periods=20, freq='US') + s = Series(np.arange(20), rng) + + assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10]) + assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) + + assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) + assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) + + self.assertEqual(s[Timestamp('2005-1-1 00:00:59.999990')], s.iloc[0]) + self.assertRaisesRegexp(KeyError, '2005-1-1 00:00:00', + lambda: s['2005-1-1 00:00:00']) + def test_partial_slicing_with_multiindex(self): # GH 4758 @@ -3955,6 +4005,24 @@ def test_date_range_fy5252(self): self.assertEqual(dr[0], Timestamp('2013-01-31')) self.assertEqual(dr[1], Timestamp('2014-01-30')) + def test_partial_slice_doesnt_require_monotonicity(self): + # For historical reasons. + s = pd.Series(np.arange(10), + pd.date_range('2014-01-01', periods=10)) + + nonmonotonic = s[[3, 5, 4]] + expected = nonmonotonic.iloc[:0] + timestamp = pd.Timestamp('2014-01-10') + + assert_series_equal(nonmonotonic['2014-01-10':], expected) + self.assertRaisesRegexp(KeyError, "Timestamp\('2014-01-10 00:00:00'\)", + lambda: nonmonotonic[timestamp:]) + + assert_series_equal(nonmonotonic.ix['2014-01-10':], expected) + self.assertRaisesRegexp(KeyError, "Timestamp\('2014-01-10 00:00:00'\)", + lambda: nonmonotonic.ix[timestamp:]) + + class TimeConversionFormats(tm.TestCase): def test_to_datetime_format(self): values = ['1/1/2000', '1/2/2000', '1/3/2000']