From 93f9073f6b30f367b7ea9217ad4b8d3be6dd15ed Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 14 Dec 2014 18:54:12 -0500 Subject: [PATCH] BUG: Bug in MultiIndex.has_duplicates when having many levels causes an indexer overflow (GH9075) --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.16.0.txt | 2 ++ pandas/core/index.py | 15 ++++++++++-- pandas/tests/test_base.py | 3 +++ pandas/tests/test_index.py | 43 +++++++++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 2 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d2f94c22f0335..b6fd14f425bd0 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1176,6 +1176,7 @@ Attributes Index.is_monotonic_increasing Index.is_monotonic_decreasing Index.is_unique + Index.has_duplicates Index.dtype Index.inferred_type Index.is_all_dates diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index a377b9caadc0c..8d7cd610c14c3 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -48,3 +48,5 @@ Bug Fixes ~~~~~~~~~ .. _whatsnew_0160.bug_fixes: + +- Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`) diff --git a/pandas/core/index.py b/pandas/core/index.py index 0f682893490dd..e082881d8e831 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -600,6 +600,10 @@ def is_unique(self): """ return if the index has unique values """ return self._engine.is_unique + @property + def has_duplicates(self): + return not self.is_unique + def is_boolean(self): return self.inferred_type in ['boolean'] @@ -3223,12 +3227,19 @@ def has_duplicates(self): """ Return True if there are no unique groups """ - # has duplicates + + from pandas.core.groupby import _int64_overflow_possible + + # if we have a possible overflow, then fallback to safe method shape = [len(lev) for lev in self.levels] + if _int64_overflow_possible(shape): + return self.duplicated().any() + + # int64 capable group_index = np.zeros(len(self), dtype='i8') for i in range(len(shape)): stride = np.prod([x for x in shape[i + 1:]], dtype='i8') - group_index += self.labels[i] * stride + group_index += _ensure_int64(self.labels[i]) * stride if len(np.unique(group_index)) < len(group_index): return True diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 615346f34b5bf..be5e102691fa0 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -620,6 +620,9 @@ def test_duplicated_drop_duplicates(self): tm.assert_index_equal(result, original) self.assertFalse(result is original) + # has_duplicates + self.assertFalse(original.has_duplicates) + # create repeated values, 3rd and 5th values are duplicated idx = original[list(range(len(original))) + [5, 3]] expected = Index([False] * len(original) + [True, True]) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index c3b1ea6d742e3..5ce5ea84dec65 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3435,6 +3435,49 @@ def test_has_duplicates(self): [0, 1, 2, 0, 0, 1, 2]]) self.assertTrue(index.has_duplicates) + # GH 9075 + t = [(u'x', u'out', u'z', 5, u'y', u'in', u'z', 169), + (u'x', u'out', u'z', 7, u'y', u'in', u'z', 119), + (u'x', u'out', u'z', 9, u'y', u'in', u'z', 135), + (u'x', u'out', u'z', 13, u'y', u'in', u'z', 145), + (u'x', u'out', u'z', 14, u'y', u'in', u'z', 158), + (u'x', u'out', u'z', 16, u'y', u'in', u'z', 122), + (u'x', u'out', u'z', 17, u'y', u'in', u'z', 160), + (u'x', u'out', u'z', 18, u'y', u'in', u'z', 180), + (u'x', u'out', u'z', 20, u'y', u'in', u'z', 143), + (u'x', u'out', u'z', 21, u'y', u'in', u'z', 128), + (u'x', u'out', u'z', 22, u'y', u'in', u'z', 129), + (u'x', u'out', u'z', 25, u'y', u'in', u'z', 111), + (u'x', u'out', u'z', 28, u'y', u'in', u'z', 114), + (u'x', u'out', u'z', 29, u'y', u'in', u'z', 121), + (u'x', u'out', u'z', 31, u'y', u'in', u'z', 126), + (u'x', u'out', u'z', 32, u'y', u'in', u'z', 155), + (u'x', u'out', u'z', 33, u'y', u'in', u'z', 123), + (u'x', u'out', u'z', 12, u'y', u'in', u'z', 144)] + index = pd.MultiIndex.from_tuples(t) + self.assertFalse(index.has_duplicates) + + # handle int64 overflow if possible + def check(nlevels): + labels = np.tile(np.arange(500), 2) + level = np.arange(500) + + # no dups + index = MultiIndex(levels=[level] * nlevels + [[0, 1]], + labels=[labels] * nlevels + [np.arange(2).repeat(500)]) + self.assertFalse(index.has_duplicates) + + # with a dup + values = index.values.tolist() + index = MultiIndex.from_tuples(values + [values[0]]) + self.assertTrue(index.has_duplicates) + + # no overflow + check(4) + + # overflow possible + check(8) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values)