From ffc90d68e1de6764bb969c6c2744dc6ffd0ab818 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 10 May 2013 10:40:54 -0400 Subject: [PATCH 1/2] BUG: non-unique indexers with a list-like now return in the same order as the passed values --- RELEASE.rst | 1 + doc/source/indexing.rst | 3 +++ pandas/core/index.py | 13 +++++++++++++ pandas/core/indexing.py | 19 +++---------------- pandas/index.pyx | 29 +++++++++++++++++++++++++++++ pandas/tests/test_frame.py | 24 ++++++++++++++++++++++-- pandas/tests/test_indexing.py | 10 ++++++++++ 7 files changed, 81 insertions(+), 18 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 31627cec01d1e..84c1bcf974549 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -148,6 +148,7 @@ pandas 0.11.1 .. _GH3552: https://github.com/pydata/pandas/issues/3552 .. _GH3562: https://github.com/pydata/pandas/issues/3562 .. _GH3586: https://github.com/pydata/pandas/issues/3586 +.. _GH3561: https://github.com/pydata/pandas/issues/3561 .. _GH3493: https://github.com/pydata/pandas/issues/3493 .. _GH3579: https://github.com/pydata/pandas/issues/3579 .. _GH3593: https://github.com/pydata/pandas/issues/3593 diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index d67a2d51cc1b8..55b7e653c3630 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions): - ``slice_locs``: returns the "range" to slice between two labels - ``get_indexer``: Computes the indexing vector for reindexing / data alignment purposes. See the source / docstrings for more on this + - ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data + alignment purposes when the index is non-unique. 
See the source / docstrings + for more on this - ``reindex``: Does any pre-conversion of the input index then calls ``get_indexer`` - ``union``, ``intersection``: computes the union or intersection of two diff --git a/pandas/core/index.py b/pandas/core/index.py index 7baae543714ec..4bf53c00d0f61 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -859,6 +859,19 @@ def get_indexer(self, target, method=None, limit=None): return com._ensure_platform_int(indexer) + def get_indexer_non_unique(self, target, **kwargs): + """ return an indexer suitable for takng from a non unique index + return the labels in the same order ast the target, + target must be an iterable """ + target = _ensure_index(target) + pself, ptarget = self._possibly_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if self.is_all_dates: + return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8)) + return Index(self._engine.get_indexer_non_unique(target.values)) + def _possibly_promote(self, other): # A hack, but it works from pandas.tseries.index import DatetimeIndex diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bc8b7a3646a33..fd2c89b019a2f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -458,8 +458,8 @@ def _reindex(keys, level=None): if labels.is_unique: return _reindex(keyarr, level=level) else: - mask = labels.isin(keyarr) - return self.obj.take(mask.nonzero()[0], axis=axis, convert=False) + indexer = labels.get_indexer_non_unique(keyarr) + return self.obj.take(indexer, axis=axis, convert=False) def _convert_to_indexer(self, obj, axis=0): """ @@ -569,20 +569,7 @@ def _convert_to_indexer(self, obj, axis=0): # non-unique (dups) else: - indexer = [] - check = np.arange(len(labels)) - lvalues = labels.values - for x in objarr: - # ugh - to_or = lib.map_infer(lvalues, x.__eq__) - if not to_or.any(): - raise KeyError('%s not in index' % str(x)) - - # add the indicies 
(as we want to take) - indexer.extend(check[to_or]) - - indexer = Index(indexer) - + indexer = check = labels.get_indexer_non_unique(objarr) mask = check == -1 if mask.any(): diff --git a/pandas/index.pyx b/pandas/index.pyx index 2ad5474549ec6..bd1bef9408717 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -267,8 +267,37 @@ cdef class IndexEngine: self._ensure_mapping_populated() return self.mapping.lookup(values) + def get_indexer_non_unique(self, targets): + """ return an indexer suitable for takng from a non unique index + return the labels in the same order ast the target """ + cdef: + ndarray values + ndarray[int64_t] result + object v, val + int count = 0 + Py_ssize_t i, j, n + + self._ensure_mapping_populated() + values = self._get_index_values() + n = len(values) + n_t = len(targets) + result = np.empty(n, dtype=np.int64) + + for i in range(n_t): + val = util.get_value_at(targets, i) + + for j in range(n): + v = util.get_value_at(values, j) + + if v == val: + result[count] = j + count += 1 + + if count == 0: + raise KeyError + return result[0:count] cdef class Int64Engine(IndexEngine): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index ce89dda63597f..064731319da85 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4668,8 +4668,28 @@ def _check_df(df,cols=None): with ensure_clean() as path: df.to_csv(path,cols = cols,chunksize=chunksize) rs_c = pd.read_csv(path,index_col=0) - rs_c.columns = df.columns - assert_frame_equal(df,rs_c,check_names=False) + + # we wrote them in a different order + # so compare them in that order + if cols is not None: + + if df.columns.is_unique: + rs_c.columns = cols + else: + rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols)) + + for c in cols: + obj_df = df[c] + obj_rs = rs_c[c] + if isinstance(obj_df,Series): + assert_series_equal(obj_df,obj_rs) + else: + assert_frame_equal(obj_df,obj_rs,check_names=False) + + # wrote in the same order + else: + 
rs_c.columns = df.columns + assert_frame_equal(df,rs_c,check_names=False) chunksize=5 N = int(chunksize*2.5) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 01651f2674a90..0b51416ced2b5 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -784,6 +784,16 @@ def test_dups_fancy_indexing(self): assert_frame_equal(df,result) + # GH 3561, dups not in selected order + ind = ['A', 'A', 'B', 'C'] + df = DataFrame({'test':range(len(ind))}, index=ind) + rows = ['C', 'B'] + res = df.ix[rows] + self.assert_(rows == list(res.index)) + + res = df.ix[Index(rows)] + self.assert_(Index(rows).equals(res.index)) + def test_indexing_mixed_frame_bug(self): # GH3492 From b84d649354265d03c0ff0fdfc8fb1c15ebbf7bae Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 10 May 2013 11:56:01 -0400 Subject: [PATCH 2/2] BUG: handle missing indexers in duplicate indices similarly to how unique handles (e.g. by reindexing) --- RELEASE.rst | 1 + pandas/core/index.py | 16 +++++++++++----- pandas/core/indexing.py | 22 +++++++++++++++++++--- pandas/index.pyx | 25 +++++++++++++++++-------- pandas/lib.pyx | 19 +++++++++++++++++++ pandas/tests/test_frame.py | 3 ++- pandas/tests/test_indexing.py | 12 ++++++++++++ 7 files changed, 81 insertions(+), 17 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 84c1bcf974549..4e6570669656d 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -91,6 +91,7 @@ pandas 0.11.1 (removed warning) (GH2786_), and fix (GH3230_) - Fix to_csv to handle non-unique columns (GH3495_) - Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_) + and handle missing elements like unique indices (GH3561_) - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_) - Fixed bug in groupby with empty series referencing a variable before assignment. 
(GH3510_) - Fixed bug in mixed-frame assignment with aligned series (GH3492_) diff --git a/pandas/core/index.py b/pandas/core/index.py index 4bf53c00d0f61..3e5a4f5676437 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -860,17 +860,23 @@ def get_indexer(self, target, method=None, limit=None): return com._ensure_platform_int(indexer) def get_indexer_non_unique(self, target, **kwargs): - """ return an indexer suitable for takng from a non unique index - return the labels in the same order ast the target, - target must be an iterable """ + """ return an indexer suitable for taking from a non unique index + return the labels in the same order as the target, and + return a missing indexer into the target (missing are marked as -1 + in the indexer); target must be an iterable """ target = _ensure_index(target) pself, ptarget = self._possibly_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) if self.is_all_dates: - return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8)) - return Index(self._engine.get_indexer_non_unique(target.values)) + self = Index(self.asi8) + tgt_values = target.asi8 + else: + tgt_values = target.values + + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return Index(indexer), missing def _possibly_promote(self, other): # A hack, but it works diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fd2c89b019a2f..29adce4e02591 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -458,8 +458,23 @@ def _reindex(keys, level=None): if labels.is_unique: return _reindex(keyarr, level=level) else: - indexer = labels.get_indexer_non_unique(keyarr) - return self.obj.take(indexer, axis=axis, convert=False) + indexer, missing = labels.get_indexer_non_unique(keyarr) + check = indexer != -1 + result = self.obj.take(indexer[check], axis=axis, convert=False) + + # need to merge the result labels and the missing labels + if 
len(missing): + l = np.arange(len(indexer)) + + missing_labels = keyarr.take(missing) + missing_labels_indexer = l[~check] + cur_labels = result._get_axis(axis).values + cur_labels_indexer = l[check] + new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer, + missing_labels, missing_labels_indexer) + result = result.reindex_axis(new_labels,axis=axis) + + return result def _convert_to_indexer(self, obj, axis=0): """ @@ -569,7 +584,8 @@ def _convert_to_indexer(self, obj, axis=0): # non-unique (dups) else: - indexer = check = labels.get_indexer_non_unique(objarr) + indexer, missing = labels.get_indexer_non_unique(objarr) + check = indexer mask = check == -1 if mask.any(): diff --git a/pandas/index.pyx b/pandas/index.pyx index bd1bef9408717..7d33d6083d0eb 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -269,23 +269,27 @@ cdef class IndexEngine: def get_indexer_non_unique(self, targets): """ return an indexer suitable for takng from a non unique index - return the labels in the same order ast the target """ + return the labels in the same order as the target + and a missing indexer into the targets (which correspond + to the -1 indices in the results) """ cdef: ndarray values - ndarray[int64_t] result + ndarray[int64_t] result, missing object v, val - int count = 0 - Py_ssize_t i, j, n + int count = 0, count_missing = 0 + Py_ssize_t i, j, n, found self._ensure_mapping_populated() values = self._get_index_values() n = len(values) n_t = len(targets) - result = np.empty(n, dtype=np.int64) + result = np.empty(n+n_t, dtype=np.int64) + missing = np.empty(n_t, dtype=np.int64) for i in range(n_t): val = util.get_value_at(targets, i) + found = 0 for j in range(n): v = util.get_value_at(values, j) @@ -293,11 +297,16 @@ cdef class IndexEngine: if v == val: result[count] = j count += 1 + found = 1 - if count == 0: - raise KeyError + # value not found + if found == 0: + result[count] = -1 + count += 1 + missing[count_missing] = i + count_missing += 1 - return 
result[0:count] + return result[0:count], missing[0:count_missing] cdef class Int64Engine(IndexEngine): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index d043691bc061e..30c65d9fcdd9f 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns): return result +@cython.wraparound(False) +@cython.boundscheck(False) +def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer, + ndarray b, ndarray[int64_t] b_indexer): + cdef: + Py_ssize_t i, n_a, n_b + ndarray result + + n_a = len(a) + n_b = len(b) + result = np.empty(n_a+n_b,dtype=object) + + for i in range(n_a): + result[a_indexer[i]] = a[i] + for i in range(n_b): + result[b_indexer[i]] = b[i] + + return result + def fast_zip(list ndarrays): ''' diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 064731319da85..e92cc22dccaf6 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4676,7 +4676,8 @@ def _check_df(df,cols=None): if df.columns.is_unique: rs_c.columns = cols else: - rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols)) + indexer, missing = df.columns.get_indexer_non_unique(cols) + rs_c.columns = df.columns.take(indexer) for c in cols: obj_df = df[c] diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 0b51416ced2b5..46fd98fc14ffb 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -794,6 +794,18 @@ def test_dups_fancy_indexing(self): res = df.ix[Index(rows)] self.assert_(Index(rows).equals(res.index)) + rows = ['C','B','E'] + res = df.ix[rows] + self.assert_(rows == list(res.index)) + + # inconcistent returns for unique/duplicate indices when values are missing + df = DataFrame(randn(4,3),index=list('ABCD')) + expected = df.ix[['E']] + + dfnu = DataFrame(randn(5,3),index=list('AABCD')) + result = dfnu.ix[['E']] + assert_frame_equal(result, expected) + def test_indexing_mixed_frame_bug(self): # GH3492