From 93f9073f6b30f367b7ea9217ad4b8d3be6dd15ed Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Sun, 14 Dec 2014 18:54:12 -0500
Subject: [PATCH] BUG: Bug in MultiIndex.has_duplicates when having many levels
 causes an indexer overflow (GH9075)

---
 doc/source/api.rst              |  1 +
 doc/source/whatsnew/v0.16.0.txt |  2 ++
 pandas/core/index.py            | 15 ++++++++++--
 pandas/tests/test_base.py       |  3 +++
 pandas/tests/test_index.py      | 43 +++++++++++++++++++++++++++++++++
 5 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index d2f94c22f0335..b6fd14f425bd0 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -1176,6 +1176,7 @@ Attributes
    Index.is_monotonic_increasing
    Index.is_monotonic_decreasing
    Index.is_unique
+   Index.has_duplicates
    Index.dtype
    Index.inferred_type
    Index.is_all_dates
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index a377b9caadc0c..8d7cd610c14c3 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -48,3 +48,5 @@ Bug Fixes
 ~~~~~~~~~
 
 .. _whatsnew_0160.bug_fixes:
+
+- Bug in ``MultiIndex.has_duplicates`` where having many levels caused an indexer overflow (:issue:`9075`)
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 0f682893490dd..e082881d8e831 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -600,6 +600,10 @@ def is_unique(self):
         """ return if the index has unique values """
         return self._engine.is_unique
 
+    @property
+    def has_duplicates(self):
+        return not self.is_unique
+
     def is_boolean(self):
         return self.inferred_type in ['boolean']
 
@@ -3223,12 +3227,19 @@ def has_duplicates(self):
         """
         Return True if there are no unique groups
         """
-        # has duplicates
+
+        from pandas.core.groupby import _int64_overflow_possible
+
+        # if an int64 overflow is possible, fall back to the safe method
         shape = [len(lev) for lev in self.levels]
+        if _int64_overflow_possible(shape):
+            return self.duplicated().any()
+
+        # int64 capable
         group_index = np.zeros(len(self), dtype='i8')
         for i in range(len(shape)):
             stride = np.prod([x for x in shape[i + 1:]], dtype='i8')
-            group_index += self.labels[i] * stride
+            group_index += _ensure_int64(self.labels[i]) * stride
 
         if len(np.unique(group_index)) < len(group_index):
             return True
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 615346f34b5bf..be5e102691fa0 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -620,6 +620,9 @@ def test_duplicated_drop_duplicates(self):
                 tm.assert_index_equal(result, original)
                 self.assertFalse(result is original)
 
+                # has_duplicates
+                self.assertFalse(original.has_duplicates)
+
                 # create repeated values, 3rd and 5th values are duplicated
                 idx = original[list(range(len(original))) + [5, 3]]
                 expected = Index([False] * len(original) + [True, True])
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index c3b1ea6d742e3..5ce5ea84dec65 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -3435,6 +3435,49 @@ def test_has_duplicates(self):
                                    [0, 1, 2, 0, 0, 1, 2]])
         self.assertTrue(index.has_duplicates)
 
+        # GH 9075
+        t = [(u'x', u'out', u'z', 5, u'y', u'in', u'z', 169),
+             (u'x', u'out', u'z', 7, u'y', u'in', u'z', 119),
+             (u'x', u'out', u'z', 9, u'y', u'in', u'z', 135),
+             (u'x', u'out', u'z', 13, u'y', u'in', u'z', 145),
+             (u'x', u'out', u'z', 14, u'y', u'in', u'z', 158),
+             (u'x', u'out', u'z', 16, u'y', u'in', u'z', 122),
+             (u'x', u'out', u'z', 17, u'y', u'in', u'z', 160),
+             (u'x', u'out', u'z', 18, u'y', u'in', u'z', 180),
+             (u'x', u'out', u'z', 20, u'y', u'in', u'z', 143),
+             (u'x', u'out', u'z', 21, u'y', u'in', u'z', 128),
+             (u'x', u'out', u'z', 22, u'y', u'in', u'z', 129),
+             (u'x', u'out', u'z', 25, u'y', u'in', u'z', 111),
+             (u'x', u'out', u'z', 28, u'y', u'in', u'z', 114),
+             (u'x', u'out', u'z', 29, u'y', u'in', u'z', 121),
+             (u'x', u'out', u'z', 31, u'y', u'in', u'z', 126),
+             (u'x', u'out', u'z', 32, u'y', u'in', u'z', 155),
+             (u'x', u'out', u'z', 33, u'y', u'in', u'z', 123),
+             (u'x', u'out', u'z', 12, u'y', u'in', u'z', 144)]
+        index = pd.MultiIndex.from_tuples(t)
+        self.assertFalse(index.has_duplicates)
+
+        # handle int64 overflow if possible
+        def check(nlevels):
+            labels = np.tile(np.arange(500), 2)
+            level = np.arange(500)
+
+            # no dups
+            index = MultiIndex(levels=[level] * nlevels + [[0, 1]],
+                               labels=[labels] * nlevels + [np.arange(2).repeat(500)])
+            self.assertFalse(index.has_duplicates)
+
+            # with a dup
+            values = index.values.tolist()
+            index = MultiIndex.from_tuples(values + [values[0]])
+            self.assertTrue(index.has_duplicates)
+
+        # no overflow
+        check(4)
+
+        # overflow possible
+        check(8)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)