From ffc90d68e1de6764bb969c6c2744dc6ffd0ab818 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Fri, 10 May 2013 10:40:54 -0400
Subject: [PATCH 1/2] BUG: non-unique indexers with a list-like now return in
 the same order as the passed values

---
 RELEASE.rst                   |  1 +
 doc/source/indexing.rst       |  3 +++
 pandas/core/index.py          | 13 +++++++++++++
 pandas/core/indexing.py       | 19 +++----------------
 pandas/index.pyx              | 29 +++++++++++++++++++++++++++++
 pandas/tests/test_frame.py    | 24 ++++++++++++++++++++++--
 pandas/tests/test_indexing.py | 10 ++++++++++
 7 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 31627cec01d1e..84c1bcf974549 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -148,6 +148,7 @@ pandas 0.11.1
 .. _GH3552: https://github.com/pydata/pandas/issues/3552
 .. _GH3562: https://github.com/pydata/pandas/issues/3562
 .. _GH3586: https://github.com/pydata/pandas/issues/3586
+.. _GH3561: https://github.com/pydata/pandas/issues/3561
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
 .. _GH3579: https://github.com/pydata/pandas/issues/3579
 .. _GH3593: https://github.com/pydata/pandas/issues/3593
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index d67a2d51cc1b8..55b7e653c3630 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions):
   - ``slice_locs``: returns the "range" to slice between two labels
   - ``get_indexer``: Computes the indexing vector for reindexing / data
     alignment purposes. See the source / docstrings for more on this
+  - ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
+    alignment purposes when the index is non-unique. See the source / docstrings 
+    for more on this
   - ``reindex``: Does any pre-conversion of the input index then calls
     ``get_indexer``
   - ``union``, ``intersection``: computes the union or intersection of two
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 7baae543714ec..4bf53c00d0f61 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -859,6 +859,19 @@ def get_indexer(self, target, method=None, limit=None):
 
         return com._ensure_platform_int(indexer)
 
+    def get_indexer_non_unique(self, target, **kwargs):
+        """ return an indexer suitable for takng from a non unique index
+            return the labels in the same order ast the target,
+            target must be an iterable """
+        target = _ensure_index(target)
+        pself, ptarget = self._possibly_promote(target)
+        if pself is not self or ptarget is not target:
+            return pself.get_indexer_non_unique(ptarget)
+
+        if self.is_all_dates:
+            return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8))
+        return Index(self._engine.get_indexer_non_unique(target.values))
+
     def _possibly_promote(self, other):
         # A hack, but it works
         from pandas.tseries.index import DatetimeIndex
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index bc8b7a3646a33..fd2c89b019a2f 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -458,8 +458,8 @@ def _reindex(keys, level=None):
             if labels.is_unique:
                 return _reindex(keyarr, level=level)
             else:
-                mask = labels.isin(keyarr)
-                return self.obj.take(mask.nonzero()[0], axis=axis, convert=False)
+                indexer = labels.get_indexer_non_unique(keyarr)
+                return self.obj.take(indexer, axis=axis, convert=False)
 
     def _convert_to_indexer(self, obj, axis=0):
         """
@@ -569,20 +569,7 @@ def _convert_to_indexer(self, obj, axis=0):
 
                     # non-unique (dups)
                     else:
-                        indexer = []
-                        check   = np.arange(len(labels))
-                        lvalues = labels.values
-                        for x in objarr:
-                            # ugh
-                            to_or = lib.map_infer(lvalues, x.__eq__)
-                            if not to_or.any():
-                                raise KeyError('%s not in index' % str(x))
-
-                            # add the indicies (as we want to take)
-                            indexer.extend(check[to_or])
-
-                        indexer = Index(indexer)
-
+                        indexer = check = labels.get_indexer_non_unique(objarr)
 
                 mask = check == -1
                 if mask.any():
diff --git a/pandas/index.pyx b/pandas/index.pyx
index 2ad5474549ec6..bd1bef9408717 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -267,8 +267,37 @@ cdef class IndexEngine:
         self._ensure_mapping_populated()
         return self.mapping.lookup(values)
 
+    def get_indexer_non_unique(self, targets):
+        """ return an indexer suitable for takng from a non unique index
+            return the labels in the same order ast the target """
 
+        cdef:
+            ndarray values
+            ndarray[int64_t] result
+            object v, val
+            int count = 0
+            Py_ssize_t i, j, n
+
+        self._ensure_mapping_populated()
+        values = self._get_index_values()
+        n = len(values)
+        n_t = len(targets)
+        result = np.empty(n, dtype=np.int64)
+
+        for i in range(n_t):
+            val = util.get_value_at(targets, i)
+
+            for j in range(n):
+                v = util.get_value_at(values, j)
+
+                if v == val:
+                   result[count] = j
+                   count += 1
+
+        if count == 0:
+            raise KeyError
 
+        return result[0:count]
 
 cdef class Int64Engine(IndexEngine):
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index ce89dda63597f..064731319da85 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4668,8 +4668,28 @@ def _check_df(df,cols=None):
             with ensure_clean() as path:
                 df.to_csv(path,cols = cols,chunksize=chunksize)
                 rs_c = pd.read_csv(path,index_col=0)
-                rs_c.columns = df.columns
-                assert_frame_equal(df,rs_c,check_names=False)
+
+                # we wrote them in a different order
+                # so compare them in that order
+                if cols is not None:
+
+                    if df.columns.is_unique:
+                        rs_c.columns = cols
+                    else:
+                        rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols))
+
+                    for c in cols:
+                       obj_df = df[c]
+                       obj_rs = rs_c[c]
+                       if isinstance(obj_df,Series):
+                           assert_series_equal(obj_df,obj_rs)
+                       else:
+                           assert_frame_equal(obj_df,obj_rs,check_names=False) 
+
+                # wrote in the same order
+                else:
+                    rs_c.columns = df.columns
+                    assert_frame_equal(df,rs_c,check_names=False)
 
         chunksize=5
         N = int(chunksize*2.5)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 01651f2674a90..0b51416ced2b5 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -784,6 +784,16 @@ def test_dups_fancy_indexing(self):
 
         assert_frame_equal(df,result)
 
+        # GH 3561, dups not in selected order
+        ind = ['A', 'A', 'B', 'C']
+        df = DataFrame({'test':range(len(ind))}, index=ind)
+        rows = ['C', 'B']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        res = df.ix[Index(rows)]
+        self.assert_(Index(rows).equals(res.index))
+
     def test_indexing_mixed_frame_bug(self):
 
         # GH3492

From b84d649354265d03c0ff0fdfc8fb1c15ebbf7bae Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Fri, 10 May 2013 11:56:01 -0400
Subject: [PATCH 2/2] BUG: handle missing indexers in duplicate indices
 similarly to how unique indices are handled (e.g. by reindexing)

---
 RELEASE.rst                   |  1 +
 pandas/core/index.py          | 16 +++++++++++-----
 pandas/core/indexing.py       | 22 +++++++++++++++++++---
 pandas/index.pyx              | 25 +++++++++++++++++--------
 pandas/lib.pyx                | 19 +++++++++++++++++++
 pandas/tests/test_frame.py    |  3 ++-
 pandas/tests/test_indexing.py | 12 ++++++++++++
 7 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 84c1bcf974549..4e6570669656d 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -91,6 +91,7 @@ pandas 0.11.1
       (removed warning) (GH2786_), and fix (GH3230_)
     - Fix to_csv to handle non-unique columns (GH3495_)
     - Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
+      and handle missing elements like unique indices (GH3561_)
     - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_)
   - Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
   - Fixed bug in mixed-frame assignment with aligned series (GH3492_)
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 4bf53c00d0f61..3e5a4f5676437 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -860,17 +860,23 @@ def get_indexer(self, target, method=None, limit=None):
         return com._ensure_platform_int(indexer)
 
     def get_indexer_non_unique(self, target, **kwargs):
-        """ return an indexer suitable for takng from a non unique index
-            return the labels in the same order ast the target,
-            target must be an iterable """
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target, and
+            return a missing indexer into the target (missing are marked as -1
+            in the indexer); target must be an iterable """
         target = _ensure_index(target)
         pself, ptarget = self._possibly_promote(target)
         if pself is not self or ptarget is not target:
             return pself.get_indexer_non_unique(ptarget)
 
         if self.is_all_dates:
-            return Index(Index(self.asi8)._engine.get_indexer_non_unique(target.asi8))
-        return Index(self._engine.get_indexer_non_unique(target.values))
+            self = Index(self.asi8)
+            tgt_values = target.asi8
+        else:
+            tgt_values = target.values
+
+        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+        return Index(indexer), missing
 
     def _possibly_promote(self, other):
         # A hack, but it works
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index fd2c89b019a2f..29adce4e02591 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -458,8 +458,23 @@ def _reindex(keys, level=None):
             if labels.is_unique:
                 return _reindex(keyarr, level=level)
             else:
-                indexer = labels.get_indexer_non_unique(keyarr)
-                return self.obj.take(indexer, axis=axis, convert=False)
+                indexer, missing = labels.get_indexer_non_unique(keyarr)
+                check = indexer != -1
+                result = self.obj.take(indexer[check], axis=axis, convert=False)
+
+                # need to merge the result labels and the missing labels
+                if len(missing):
+                    l = np.arange(len(indexer))
+
+                    missing_labels = keyarr.take(missing)
+                    missing_labels_indexer = l[~check]
+                    cur_labels = result._get_axis(axis).values
+                    cur_labels_indexer = l[check]
+                    new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
+                                                           missing_labels, missing_labels_indexer)
+                    result = result.reindex_axis(new_labels,axis=axis)
+
+                return result
 
     def _convert_to_indexer(self, obj, axis=0):
         """
@@ -569,7 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):
 
                     # non-unique (dups)
                     else:
-                        indexer = check = labels.get_indexer_non_unique(objarr)
+                        indexer, missing = labels.get_indexer_non_unique(objarr)
+                        check = indexer
 
                 mask = check == -1
                 if mask.any():
diff --git a/pandas/index.pyx b/pandas/index.pyx
index bd1bef9408717..7d33d6083d0eb 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -269,23 +269,27 @@ cdef class IndexEngine:
 
     def get_indexer_non_unique(self, targets):
         """ return an indexer suitable for takng from a non unique index
-            return the labels in the same order ast the target """
+            return the labels in the same order as the target
+            and a missing indexer into the targets (which correspond
+            to the -1 indices in the results) """
 
         cdef:
             ndarray values
-            ndarray[int64_t] result
+            ndarray[int64_t] result, missing
             object v, val
-            int count = 0
-            Py_ssize_t i, j, n
+            int count = 0, count_missing = 0
+            Py_ssize_t i, j, n, found
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
         n = len(values)
         n_t = len(targets)
-        result = np.empty(n, dtype=np.int64)
+        result  = np.empty(n+n_t, dtype=np.int64)
+        missing = np.empty(n_t, dtype=np.int64)
 
         for i in range(n_t):
             val = util.get_value_at(targets, i)
+            found = 0
 
             for j in range(n):
                 v = util.get_value_at(values, j)
@@ -293,11 +297,16 @@ cdef class IndexEngine:
                 if v == val:
                    result[count] = j
                    count += 1
+                   found = 1
 
-        if count == 0:
-            raise KeyError
+            # value not found
+            if found == 0:
+                result[count] = -1
+                count += 1
+                missing[count_missing] = i
+                count_missing += 1
 
-        return result[0:count]
+        return result[0:count], missing[0:count_missing]
 
 cdef class Int64Engine(IndexEngine):
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index d043691bc061e..30c65d9fcdd9f 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):
 
     return result
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
+                          ndarray b, ndarray[int64_t] b_indexer):
+    cdef:
+        Py_ssize_t i, n_a, n_b
+        ndarray result
+
+    n_a = len(a)
+    n_b = len(b)
+    result = np.empty(n_a+n_b,dtype=object)
+
+    for i in range(n_a):
+        result[a_indexer[i]] = a[i]
+    for i in range(n_b):
+        result[b_indexer[i]] = b[i]
+
+    return result
+
 
 def fast_zip(list ndarrays):
     '''
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 064731319da85..e92cc22dccaf6 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -4676,7 +4676,8 @@ def _check_df(df,cols=None):
                     if df.columns.is_unique:
                         rs_c.columns = cols
                     else:
-                        rs_c.columns = df.columns.take(df.columns.get_indexer_non_unique(cols))
+                        indexer, missing = df.columns.get_indexer_non_unique(cols)
+                        rs_c.columns = df.columns.take(indexer)
 
                     for c in cols:
                        obj_df = df[c]
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 0b51416ced2b5..46fd98fc14ffb 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -794,6 +794,18 @@ def test_dups_fancy_indexing(self):
         res = df.ix[Index(rows)]
         self.assert_(Index(rows).equals(res.index))
 
+        rows = ['C','B','E']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        # inconsistent returns for unique/duplicate indices when values are missing
+        df = DataFrame(randn(4,3),index=list('ABCD'))
+        expected = df.ix[['E']]
+
+        dfnu = DataFrame(randn(5,3),index=list('AABCD'))
+        result = dfnu.ix[['E']]
+        assert_frame_equal(result, expected)
+
     def test_indexing_mixed_frame_bug(self):
 
         # GH3492