From 8f3ed7fdf84fd3adc01b663d9f664310f332dd63 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 20 May 2013 15:16:43 -0400 Subject: [PATCH 1/2] BUG: Non-unique indexing via ``loc`` and friends fixed (GH3659_) BUG: deal with non_monotonic indices CLN: convert slice_locs arrays to sliced ranges if possible --- RELEASE.rst | 2 ++ pandas/core/index.py | 66 ++++++++++++++++++++++++++++++++--- pandas/core/indexing.py | 1 + pandas/tests/test_indexing.py | 24 +++++++++++++ 4 files changed, 89 insertions(+), 4 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 9b3cc3683c3de..e02ad66252bdc 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -115,6 +115,7 @@ pandas 0.11.1 and handle missing elements like unique indices (GH3561_) - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_) - Concat to produce a non-unique columns when duplicates are across dtypes is fixed (GH3602_) + - Non-unique indexing with a slice via ``loc`` and friends fixed (GH3659_) - Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_) - Fixed bug in mixed-frame assignment with aligned series (GH3492_) - Fixed bug in selecting month/quarter/year from a series would not select the time element @@ -215,6 +216,7 @@ pandas 0.11.1 .. _GH3638: https://github.com/pydata/pandas/issues/3638 .. _GH3605: https://github.com/pydata/pandas/issues/3605 .. _GH3606: https://github.com/pydata/pandas/issues/3606 +.. _GH3659: https://github.com/pydata/pandas/issues/3659 .. 
_Gh3616: https://github.com/pydata/pandas/issues/3616 pandas 0.11.0 diff --git a/pandas/core/index.py b/pandas/core/index.py index 3e5a4f5676437..cad1186c6addf 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1219,13 +1219,39 @@ def slice_locs(self, start=None, end=None): ----- This function assumes that the data is sorted, so use at your own peril """ + + is_unique = self.is_unique if start is None: - start_slice = 0 + if is_unique: + start_slice = 0 + else: + start_slice = np.arange(len(self)) else: try: start_slice = self.get_loc(start) - if isinstance(start_slice, slice): + + if not is_unique: + + # get_loc will return a boolean array for non_uniques + # if we are not monotonic + if isinstance(start_slice,np.ndarray): + if not self.is_monotonic: + raise KeyError("cannot peform a slice operation " + "on a non-unique non-monotonic index") + start_slice = np.arange(len(self))[start_slice] + + # select all in the slice + all the rest of the entries + # to the right + elif isinstance(start_slice, slice): + ss = np.arange(start_slice.stop,len(self)) + start_slice = np.arange(len(self))[start_slice] + start_slice = (Index(ss) | Index(start_slice)).values + else: + start_slice = np.arange(start_slice,len(self)) + + elif isinstance(start_slice, slice): start_slice = start_slice.start + except KeyError: if self.is_monotonic: start_slice = self.searchsorted(start, side='left') @@ -1233,20 +1259,52 @@ def slice_locs(self, start=None, end=None): raise if end is None: - end_slice = len(self) + if is_unique: + end_slice = len(self) + else: + end_slice = np.arange(len(self)) else: try: end_slice = self.get_loc(end) - if isinstance(end_slice, slice): + + if not is_unique: + + # get_loc will return a boolean array for non_uniques + if isinstance(end_slice,np.ndarray): + if not self.is_monotonic: + raise KeyError("cannot perform a slice operation " + "on a non-unique non-monotonic index") + end_slice = np.arange(len(self))[end_slice] + + # select all in the slice 
+ all to the left of the entries + elif isinstance(end_slice, slice): + es = np.arange(0,end_slice.start) + end_slice = np.arange(len(self))[end_slice] + end_slice = (Index(es) | Index(end_slice)).values + else: + end_slice = np.arange(0,end_slice+1) + + elif isinstance(end_slice, slice): end_slice = end_slice.stop else: end_slice += 1 + except KeyError: if self.is_monotonic: end_slice = self.searchsorted(end, side='right') else: raise + if not is_unique: + # see if we can convert back to and edge slice + if len(start_slice) == len(end_slice) and (start_slice == end_slice).all(): + start_slice, end_slice = start_slice[0], start_slice[-1]+1 + # partial slice + elif (len(start_slice) == start_slice[-1]-start_slice[0]+1) and ( + len(end_slice) == end_slice[-1]-end_slice[0]+1): + res = (Index(start_slice) & Index(end_slice)).values + start_slice, end_slice = res[0],res[-1]+1 + return start_slice, end_slice def delete(self, loc): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ea684ef11446c..41f20cbcc15ac 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -759,6 +759,7 @@ def _getitem_axis(self, key, axis=0): labels = self.obj._get_axis(axis) if isinstance(key, slice): + self._has_valid_type(key,axis) return self._get_slice_axis(key, axis=axis) elif com._is_bool_indexer(key): return self._getbool_axis(key, axis=axis) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index e9afa1ae6ec1d..5891e8ac08040 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -953,6 +953,30 @@ def test_iloc_mask(self): (key,ans,r)) warnings.filterwarnings(action='always', category=UserWarning) + def test_non_unique_loc(self): + ## GH3659 + ## non-unique indexer with loc slice + ## https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs + + # these are going to raise because we are non monotonic + df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]) + 
self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,None)])) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,None)])) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,2)])) + + # monotonic are ok + df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]).sort(axis=0) + result = df.loc[1:] + expected = DataFrame({'A' : [2,4,5,6], 'B' : [4, 6,7,8]}, index = [1,1,2,3]) + assert_frame_equal(result,expected) + + result = df.loc[0:] + assert_frame_equal(result,df) + + result = df.loc[1:2] + expected = DataFrame({'A' : [2,4,5], 'B' : [4,6,7]}, index = [1,1,2]) + assert_frame_equal(result,expected) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From c47bc50bef5b083f0aac64df5338c3fc642b2ab5 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 20 May 2013 17:45:26 -0400 Subject: [PATCH 2/2] CLN: did not need to convert to index array/slicer as the only time this happens is when a boolean array comes back from get_loc, means the index is non_monotonic, which is an exception in any event --- pandas/core/index.py | 53 +++++++------------------------------------- 1 file changed, 8 insertions(+), 45 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index cad1186c6addf..3a6913a924c1d 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1222,10 +1222,7 @@ def slice_locs(self, start=None, end=None): is_unique = self.is_unique if start is None: - if is_unique: - start_slice = 0 - else: - start_slice = np.arange(len(self)) + start_slice = 0 else: try: start_slice = self.get_loc(start) @@ -1235,21 +1232,10 @@ def slice_locs(self, start=None, end=None): # get_loc will return a boolean array for non_uniques # if we are not monotonic if isinstance(start_slice,np.ndarray): - if not self.is_monotonic: - raise KeyError("cannot peform a slice operation " - "on a non-unique non-monotonic index") - start_slice = 
np.arange(len(self))[start_slice] - - # select all in the slice + all the rest of the entries - # to the right - elif isinstance(start_slice, slice): - ss = np.arange(start_slice.stop,len(self)) - start_slice = np.arange(len(self))[start_slice] - start_slice = (Index(ss) | Index(start_slice)).values - else: - start_slice = np.arange(start_slice,len(self)) + raise KeyError("cannot peform a slice operation " + "on a non-unique non-monotonic index") - elif isinstance(start_slice, slice): + if isinstance(start_slice, slice): start_slice = start_slice.start except KeyError: @@ -1259,10 +1245,7 @@ def slice_locs(self, start=None, end=None): raise if end is None: - if is_unique: - end_slice = len(self) - else: - end_slice = np.arange(len(self)) + end_slice = len(self) else: try: end_slice = self.get_loc(end) @@ -1271,20 +1254,10 @@ def slice_locs(self, start=None, end=None): # get_loc will return a boolean array for non_uniques if isinstance(end_slice,np.ndarray): - if not self.is_monotonic: - raise KeyError("cannot perform a slice operation " - "on a non-unique non-monotonic index") - end_slice = np.arange(len(self))[end_slice] - - # select all in the slice + all to the left of the entries - elif isinstance(end_slice, slice): - es = np.arange(0,end_slice.start) - end_slice = np.arange(len(self))[end_slice] - end_slice = (Index(es) | Index(end_slice)).values - else: - end_slice = np.arange(0,end_slice+1) + raise KeyError("cannot perform a slice operation " + "on a non-unique non-monotonic index") - elif isinstance(end_slice, slice): + if isinstance(end_slice, slice): end_slice = end_slice.stop else: end_slice += 1 @@ -1295,16 +1268,6 @@ def slice_locs(self, start=None, end=None): else: raise - if not is_unique: - # see if we can convert back to and edge slice - if len(start_slice) == len(end_slice) and (start_slice == end_slice).all(): - start_slice, end_slice = start_slice[0], start_slice[-1]+1 - # partial slice - elif (len(start_slice) == 
start_slice[-1]-start_slice[0]+1) and ( - len(end_slice) == end_slice[-1]-end_slice[0]+1): - res = (Index(start_slice) & Index(end_slice)).values - start_slice, end_slice = res[0],res[-1]+1 - return start_slice, end_slice def delete(self, loc):