From 5e765d0632c0aad5e27bd93be12137b0721eb6b4 Mon Sep 17 00:00:00 2001 From: Nicolas Bonnotte Date: Thu, 28 Jan 2016 10:09:01 +0100 Subject: [PATCH] BUG in MultiIndex.drop for not-lexsorted multi-indexes, #12078 Closes #12078 --- doc/source/whatsnew/v0.18.0.txt | 4 ++-- pandas/indexes/multi.py | 16 +++++++++++++++- pandas/tests/indexes/test_multi.py | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 3a188ea20f8a3..7d312165fab74 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -523,7 +523,7 @@ Bug Fixes - Bug in ``read_sql`` with ``pymysql`` connections failing to return chunked data (:issue:`11522`) - Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`) - Bug in ``Int64Index`` and ``Float64Index`` preventing the use of the modulo operator (:issue:`9244`) - +- Bug in ``MultiIndex.drop`` for not lexsorted multi-indexes (:issue:`12078`) - Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`) @@ -544,4 +544,4 @@ Bug Fixes - Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`) -- Bug in ``buffer_rd_bytes`` src->buffer could be freed more than once if reading failed, causing a segfault (:issue:`12098`) +- Bug in ``buffer_rd_bytes`` src->buffer could be freed more than once if reading failed, causing a segfault (:issue:`12098`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 2d0ad1925daa0..1b7f057de9677 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1083,10 +1083,24 @@ def drop(self, labels, level=None, errors='raise'): for label in labels: try: loc = self.get_loc(label) + # get_loc returns either an integer, a slice, or a boolean + # mask if isinstance(loc, int): inds.append(loc) - else: + elif isinstance(loc, slice): 
inds.extend(lrange(loc.start, loc.stop)) + elif is_bool_indexer(loc): + if self.lexsort_depth == 0: + warnings.warn('dropping on a non-lexsorted multi-index ' + 'without a level parameter may impact ' + 'performance.', + PerformanceWarning, + stacklevel=2) + loc = loc.nonzero()[0] + inds.extend(loc) + else: + msg = 'unsupported indexer of type {}'.format(type(loc)) + raise AssertionError(msg) except KeyError: if errors != 'ignore': raise diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 6bc644d84b0d0..6d49f5dcb342e 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -8,6 +8,7 @@ from pandas import (date_range, MultiIndex, Index, CategoricalIndex, compat) +from pandas.io.common import PerformanceWarning from pandas.indexes.base import InvalidIndexError from pandas.compat import range, lrange, u, PY3, long, lzip @@ -1419,6 +1420,28 @@ def test_droplevel_multiple(self): expected = index[:2].droplevel(2).droplevel(0) self.assertTrue(dropped.equals(expected)) + def test_drop_not_lexsorted(self): + # GH 12078 + + # define the lexsorted version of the multi-index + tuples = [('a', ''), ('b1', 'c1'), ('b2', 'c2')] + lexsorted_mi = MultiIndex.from_tuples(tuples, names=['b', 'c']) + self.assertTrue(lexsorted_mi.is_lexsorted()) + + # and the not-lexsorted version + df = pd.DataFrame(columns=['a', 'b', 'c', 'd'], + data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]]) + df = df.pivot_table(index='a', columns=['b', 'c'], values='d') + df = df.reset_index() + not_lexsorted_mi = df.columns + self.assertFalse(not_lexsorted_mi.is_lexsorted()) + + # compare the results + self.assert_index_equal(lexsorted_mi, not_lexsorted_mi) + with self.assert_produces_warning(PerformanceWarning): + self.assert_index_equal(lexsorted_mi.drop('a'), + not_lexsorted_mi.drop('a')) + def test_insert(self): # key contained in all levels new_index = self.index.insert(0, ('bar', 'two'))