diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index d64dbf6e14345..6e065c5818616 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -119,6 +119,7 @@ Bug Fixes - Bug in ``BlockManager`` where setting values with different type would break block integrity (:issue:`8850`) - Bug in ``DatetimeIndex`` when using ``time`` object as key (:issue:`8667`) - Bug in ``merge`` where ``how='left'`` and ``sort=False`` would not preserve left frame order (:issue:`7331`) +- Bug in ``MultiIndex.reindex`` where reindexing at level would not reorder labels (:issue:`4088`) - Fix negative step support for label-based slices (:issue:`8753`) diff --git a/pandas/core/index.py b/pandas/core/index.py index 7d9f772126483..be17c36e65675 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1828,13 +1828,41 @@ def _join_non_unique(self, other, how='left', return_indexers=False): else: return join_index - def _join_level(self, other, level, how='left', return_indexers=False): + def _join_level(self, other, level, how='left', + return_indexers=False, + keep_order=True): """ The join method *only* affects the level of the resulting MultiIndex. Otherwise it just exactly aligns the Index data to the - labels of the level in the MultiIndex. The order of the data indexed by - the MultiIndex will not be changed (currently) - """ + labels of the level in the MultiIndex. If `keep_order` == True, the + order of the data indexed by the MultiIndex will not be changed; + otherwise, it will tie out with `other`. 
+ """ + from pandas.algos import groupsort_indexer + + def _get_leaf_sorter(labels): + ''' + returns sorter for the inner most level while preserving the + order of higher levels + ''' + if labels[0].size == 0: + return np.empty(0, dtype='int64') + + if len(labels) == 1: + lab = com._ensure_int64(labels[0]) + sorter, _ = groupsort_indexer(lab, 1 + lab.max()) + return sorter + + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] + + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = com._ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, starts) + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): raise TypeError('Join on level between two MultiIndex objects ' 'is ambiguous') @@ -1849,33 +1877,69 @@ def _join_level(self, other, level, how='left', return_indexers=False): level = left._get_level_number(level) old_level = left.levels[level] + if not right.is_unique: + raise NotImplementedError('Index._join_level on non-unique index ' + 'is not implemented') + new_level, left_lev_indexer, right_lev_indexer = \ old_level.join(right, how=how, return_indexers=True) - if left_lev_indexer is not None: + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.labels[:level + 1]) + join_index = left[left_indexer] + + else: left_lev_indexer = com._ensure_int64(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) new_lev_labels = com.take_nd(rev_indexer, left.labels[level], allow_fill=False) - omit_mask = new_lev_labels != -1 new_labels = list(left.labels) new_labels[level] = new_lev_labels - if not omit_mask.all(): - new_labels = [lab[omit_mask] for lab in new_labels] - new_levels = list(left.levels) new_levels[level] = new_level - join_index =
MultiIndex(levels=new_levels, labels=new_labels, - names=left.names, verify_integrity=False) - left_indexer = np.arange(len(left))[new_lev_labels != -1] - else: - join_index = left - left_indexer = None + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left)) + mask = new_lev_labels != -1 + if not mask.all(): + new_labels = [lab[mask] for lab in new_labels] + left_indexer = left_indexer[mask] + + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_lev_labels.max() + left_indexer, counts = groupsort_indexer(new_lev_labels, + ngroups) + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0]:] + new_labels = [lab[left_indexer] for lab in new_labels] + + else: # sort the leaves + mask = new_lev_labels != -1 + mask_all = mask.all() + if not mask_all: + new_labels = [lab[mask] for lab in new_labels] + + left_indexer = _get_leaf_sorter(new_labels[:level + 1]) + new_labels = [lab[left_indexer] for lab in new_labels] + + # left_indexers are w.r.t masked frame. + # reverse to original frame! 
+ if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] + + join_index = MultiIndex(levels=new_levels, + labels=new_labels, + names=left.names, + verify_integrity=False) if right_lev_indexer is not None: right_indexer = com.take_nd(right_lev_indexer, @@ -3925,7 +3989,8 @@ def reindex(self, target, method=None, level=None, limit=None): else: target = _ensure_index(target) target, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) + return_indexers=True, + keep_order=False) else: if self.equals(target): indexer = None diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 2a5b93d111acc..71aeaf0895035 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1138,6 +1138,27 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, return out +@cython.boundscheck(False) +@cython.wraparound(False) +def get_level_sorter(ndarray[int64_t, ndim=1] label, + ndarray[int64_t, ndim=1] starts): + """ + argsort for a single level of a multi-index, keeping the order of higher + levels unchanged. 
`starts` points to starts of same-key indices w.r.t. leading levels; equivalent to: + np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort') + + starts[i] for i in range(len(starts) - 1)]) + """ + cdef: + int64_t l, r + Py_ssize_t i + ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64) + + for i in range(len(starts) - 1): + l, r = starts[i], starts[i + 1] + out[l:r] = l + label[l:r].argsort(kind='mergesort') + + return out def group_count(ndarray[int64_t] values, Py_ssize_t size): cdef: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 67f86a1c6cb7e..40823537dbc04 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1897,6 +1897,66 @@ def test_reversed_reindex_ffill_raises(self): self.assertRaises(ValueError, df.reindex, dr[::-1], method='ffill') self.assertRaises(ValueError, df.reindex, dr[::-1], method='bfill') + def test_reindex_level(self): + from itertools import permutations + icol = ['jim', 'joe', 'jolie'] + + def verify_first_level(df, level, idx): + f = lambda val: np.nonzero(df[level] == val)[0] + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[i].set_index(icol) + assert_frame_equal(left, right) + + def verify(df, level, idx, indexer): + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[indexer].set_index(icol) + assert_frame_equal(left, right) + + df = pd.DataFrame({'jim':list('B' * 4 + 'A' * 2 + 'C' * 3), + 'joe':list('abcdeabcd')[::-1], + 'jolie':[10, 20, 30] * 3, + 'joline': np.random.randint(0, 1000, 9)}) + + target = [['C', 'B', 'A'], ['F', 'C', 'A', 'D'], ['A'], ['D', 'F'], + ['A', 'B', 'C'], ['C', 'A', 'B'], ['C', 'B'], ['C', 'A'], + ['A', 'B'], ['B', 'A', 'C'], ['A', 'C', 'B']] + + for idx in target: + verify_first_level(df, 'jim', idx) + + verify(df, 'joe', list('abcde'), [3, 2, 1, 0, 5, 4, 8, 7, 6]) + verify(df, 'joe', list('abcd'), [3, 2, 1, 0, 5, 8, 7, 6]) + verify(df, 'joe',
list('abc'), [3, 2, 1, 8, 7, 6]) + verify(df, 'joe', list('eca'), [1, 3, 4, 6, 8]) + verify(df, 'joe', list('edc'), [0, 1, 4, 5, 6]) + verify(df, 'joe', list('eadbc'), [3, 0, 2, 1, 4, 5, 8, 7, 6]) + verify(df, 'joe', list('edwq'), [0, 4, 5]) + verify(df, 'joe', list('wq'), []) + + df = DataFrame({'jim':['mid'] * 5 + ['btm'] * 8 + ['top'] * 7, + 'joe':['3rd'] * 2 + ['1st'] * 3 + ['2nd'] * 3 + + ['1st'] * 2 + ['3rd'] * 3 + ['1st'] * 2 + + ['3rd'] * 3 + ['2nd'] * 2, + 'jolie':np.random.randint(0, 1000, 20), + 'joline': np.random.randn(20).round(3) * 10}) + + for idx in permutations(df['jim'].unique()): + for i in range(3): + verify_first_level(df, 'jim', idx[:i+1]) + + i = [2,3,4,0,1,8,9,5,6,7,10,11,12,13,14,18,19,15,16,17] + verify(df, 'joe', ['1st', '2nd', '3rd'], i) + + i = [0,1,2,3,4,10,11,12,5,6,7,8,9,15,16,17,18,19,13,14] + verify(df, 'joe', ['3rd', '2nd', '1st'], i) + + i = [0,1,5,6,7,10,11,12,18,19,15,16,17] + verify(df, 'joe', ['2nd', '3rd'], i) + + i = [0,1,2,3,4,10,11,12,8,9,15,16,17,13,14] + verify(df, 'joe', ['3rd', '1st'], i) + def test_getitem_ix_float_duplicates(self): df = pd.DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list('abc'))