From ee30e60e637b296b0477cbb70d2f781b08ff5c8f Mon Sep 17 00:00:00 2001
From: Michael Odintsov <michael@datarobot.com>
Date: Tue, 19 Jun 2018 12:08:12 +0300
Subject: [PATCH 1/7] BUG: Fix group index calculation to prevent hitting
 maximum recursion depth (#21524)

---
 doc/source/whatsnew/v0.23.2.txt |  1 +
 pandas/core/sorting.py          | 25 +++++++++++++------------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 67c7ce150132a..e8e09880be248 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -60,6 +60,7 @@ Bug Fixes
 
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
+- Bug in calculation of group index causing "maximum recursion depth exceeded" errors during ``DataFrame.duplicated`` calls  (:issue:`21524`).
 -
 
 **I/O**
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index e550976d1deeb..ffa346f2de9d5 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -52,7 +52,17 @@ def _int64_cut_off(shape):
                 return i
         return len(shape)
 
-    def loop(labels, shape):
+    def maybe_lift(lab, size):  # pormote nan values
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = map(_ensure_int64, labels)
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+    labels = list(labels)
+    shape = list(shape)
+
+    while True:
         # how many levels can be done without overflow:
         nlev = _int64_cut_off(shape)
 
@@ -74,7 +84,7 @@ def loop(labels, shape):
             out[mask] = -1
 
         if nlev == len(shape):  # all levels done!
-            return out
+            break
 
         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
@@ -83,16 +93,7 @@ def loop(labels, shape):
         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]
 
-        return loop(labels, shape)
-
-    def maybe_lift(lab, size):  # pormote nan values
-        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
-
-    labels = map(_ensure_int64, labels)
-    if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
-
-    return loop(list(labels), list(shape))
+    return out
 
 
 def get_compressed_ids(labels, sizes):

From 91a284eedab98bbd04cfe21e5f23917585b04104 Mon Sep 17 00:00:00 2001
From: Michael Odintsov <michael@datarobot.com>
Date: Tue, 19 Jun 2018 14:53:15 +0300
Subject: [PATCH 2/7] Adding more comments and updated changelog

---
 doc/source/whatsnew/v0.23.2.txt | 2 +-
 pandas/core/sorting.py          | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index e8e09880be248..c51f9af5252f3 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -60,7 +60,7 @@ Bug Fixes
 
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
-- Bug in calculation of group index causing "maximum recursion depth exceeded" errors during ``DataFrame.duplicated`` calls  (:issue:`21524`).
+- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`).
 -
 
 **I/O**
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index ffa346f2de9d5..edf04fda42117 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -52,7 +52,8 @@ def _int64_cut_off(shape):
                 return i
         return len(shape)
 
-    def maybe_lift(lab, size):  # pormote nan values
+    def maybe_lift(lab, size):
+        # promote nan values (assigned to -1 here) so that all values are non-negative
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
 
     labels = map(_ensure_int64, labels)
@@ -62,6 +63,8 @@ def maybe_lift(lab, size):  # pormote nan values
     labels = list(labels)
     shape = list(shape)
 
+    # Iteratively process all the labels in chunks sized so less than _INT64_MAX unique int ids
+    # will be required for each chunk
     while True:
         # how many levels can be done without overflow:
         nlev = _int64_cut_off(shape)

From fac97d5a745a2d1382b853fd01db704905b9d4d3 Mon Sep 17 00:00:00 2001
From: Michael Odintsov <michael@datarobot.com>
Date: Tue, 19 Jun 2018 15:05:23 +0300
Subject: [PATCH 3/7] Added testcase

---
 pandas/tests/frame/test_analytics.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 6dc24ed856017..23dfd54a2bbc6 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1527,6 +1527,19 @@ def test_duplicated_with_misspelled_column_name(self, subset):
         with pytest.raises(KeyError):
             df.drop_duplicates(subset)
 
+    def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # Given the wide dataframe with a lot of columns with different (important!) values
+        data = {}
+        for i in range(100):
+            data['col_{0:02d}'.format(i)] = np.random.randint(0, 1000, 30000)
+        df = pd.DataFrame(data).T
+        # When we request to calculate duplicates
+        dupes = df.duplicated()
+        # Then we get the bool pd.Series as a result (don't fail during calculation)
+        # Actual values doesn't matter here, though usually it's all False in this case
+        assert isinstance(dupes, pd.Series)
+        assert dupes.dtype == np.bool
+
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([

From 7e9f3159274fc72e34e72e7e0d7a4a87a1063016 Mon Sep 17 00:00:00 2001
From: Michael Odintsov <michael@datarobot.com>
Date: Tue, 19 Jun 2018 15:07:50 +0300
Subject: [PATCH 4/7] linter

---
 pandas/core/sorting.py               | 7 ++++---
 pandas/tests/frame/test_analytics.py | 9 ++++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index edf04fda42117..fb94281cb416f 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -53,7 +53,8 @@ def _int64_cut_off(shape):
         return len(shape)
 
     def maybe_lift(lab, size):
-        # promote nan values (assigned to -1 here) so that all values are non-negative
+        # promote nan values (assigned to -1 here)
+        # so that all values are non-negative
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
 
     labels = map(_ensure_int64, labels)
@@ -63,8 +64,8 @@ def maybe_lift(lab, size):
     labels = list(labels)
     shape = list(shape)
 
-    # Iteratively process all the labels in chunks sized so less than _INT64_MAX unique int ids
-    # will be required for each chunk
+    # Iteratively process all the labels in chunks sized so less
+    # than _INT64_MAX unique int ids will be required for each chunk
     while True:
         # how many levels can be done without overflow:
         nlev = _int64_cut_off(shape)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 23dfd54a2bbc6..705f236b1296c 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1528,15 +1528,18 @@ def test_duplicated_with_misspelled_column_name(self, subset):
             df.drop_duplicates(subset)
 
     def test_duplicated_do_not_fail_on_wide_dataframes(self):
-        # Given the wide dataframe with a lot of columns with different (important!) values
+        # Given the wide dataframe with a lot of columns
+        # with different (important!) values
         data = {}
         for i in range(100):
             data['col_{0:02d}'.format(i)] = np.random.randint(0, 1000, 30000)
         df = pd.DataFrame(data).T
         # When we request to calculate duplicates
         dupes = df.duplicated()
-        # Then we get the bool pd.Series as a result (don't fail during calculation)
-        # Actual values doesn't matter here, though usually it's all False in this case
+        # Then we get the bool pd.Series as a result
+        # and don't fail during calculation.
+        # Actual values doesn't matter here, though usually
+        # it's all False in this case
         assert isinstance(dupes, pd.Series)
         assert dupes.dtype == np.bool
 

From 727e654b0731b1a073e629cd4fbf15c051df94d5 Mon Sep 17 00:00:00 2001
From: Michael Odintsov <michael@datarobot.com>
Date: Wed, 20 Jun 2018 12:54:05 +0300
Subject: [PATCH 5/7] address PR review comments (better comment and mark test
 as slow)

---
 pandas/core/sorting.py               | 4 ++--
 pandas/tests/frame/test_analytics.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index fb94281cb416f..212f44e55c489 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -53,8 +53,8 @@ def _int64_cut_off(shape):
         return len(shape)
 
     def maybe_lift(lab, size):
-        # promote nan values (assigned to -1 here)
-        # so that all values are non-negative
+        # promote nan values (assigned -1 label in lab array)
+        # so that all output values are non-negative
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
 
     labels = map(_ensure_int64, labels)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 705f236b1296c..8ebe6c3b3de23 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1527,6 +1527,7 @@ def test_duplicated_with_misspelled_column_name(self, subset):
         with pytest.raises(KeyError):
             df.drop_duplicates(subset)
 
+    @pytest.mark.slow
     def test_duplicated_do_not_fail_on_wide_dataframes(self):
         # Given the wide dataframe with a lot of columns
         # with different (important!) values

From befa65d7b6314b3a9d37913524621511d2891e7f Mon Sep 17 00:00:00 2001
From: Michael Odintsov <michael@datarobot.com>
Date: Wed, 20 Jun 2018 13:16:20 +0300
Subject: [PATCH 6/7] address more PR review comments (list comprehension and
 result name)

---
 pandas/tests/frame/test_analytics.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 8ebe6c3b3de23..240f81c1777c8 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1531,18 +1531,16 @@ def test_duplicated_with_misspelled_column_name(self, subset):
     def test_duplicated_do_not_fail_on_wide_dataframes(self):
         # Given the wide dataframe with a lot of columns
         # with different (important!) values
-        data = {}
-        for i in range(100):
-            data['col_{0:02d}'.format(i)] = np.random.randint(0, 1000, 30000)
+        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+                for i in range(100)}
         df = pd.DataFrame(data).T
-        # When we request to calculate duplicates
-        dupes = df.duplicated()
-        # Then we get the bool pd.Series as a result
+        result = df.duplicated()
+        # Then duplicates produce the bool pd.Series as a result
         # and don't fail during calculation.
         # Actual values doesn't matter here, though usually
         # it's all False in this case
-        assert isinstance(dupes, pd.Series)
-        assert dupes.dtype == np.bool
+        assert isinstance(result, pd.Series)
+        assert result.dtype == np.bool
 
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836

From eebb8cf57709b679660b9b48c4db6594279a8b87 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 20 Jun 2018 06:24:35 -0400
Subject: [PATCH 7/7] doc

---
 pandas/tests/frame/test_analytics.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 240f81c1777c8..12ebdbe0fd3c7 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1529,12 +1529,14 @@ def test_duplicated_with_misspelled_column_name(self, subset):
 
     @pytest.mark.slow
     def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # gh-21524
         # Given the wide dataframe with a lot of columns
         # with different (important!) values
         data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
                 for i in range(100)}
         df = pd.DataFrame(data).T
         result = df.duplicated()
+
         # Then duplicates produce the bool pd.Series as a result
         # and don't fail during calculation.
         # Actual values doesn't matter here, though usually