From ee30e60e637b296b0477cbb70d2f781b08ff5c8f Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Tue, 19 Jun 2018 12:08:12 +0300 Subject: [PATCH 1/7] BUG: Fix group index calculation to prevent hitting maximum recursion depth (#21524) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/sorting.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 67c7ce150132a..e8e09880be248 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -60,6 +60,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in calculation of group index causing "maximum recursion depth exceeded" errors during ``DataFrame.duplicated`` calls (:issue:`21524`). - **I/O** diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e550976d1deeb..ffa346f2de9d5 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -52,7 +52,17 @@ def _int64_cut_off(shape): return i return len(shape) - def loop(labels, shape): + def maybe_lift(lab, size): # pormote nan values + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(_ensure_int64, labels) + if not xnull: + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + labels = list(labels) + shape = list(shape) + + while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) @@ -74,7 +84,7 @@ def loop(labels, shape): out[mask] = -1 if nlev == len(shape): # all levels done! - return out + break # compress what has been done so far in order to avoid overflow # to retain lexical ranks, obs_ids should be sorted @@ -83,16 +93,7 @@ def loop(labels, shape): labels = [comp_ids] + labels[nlev:] shape = [len(obs_ids)] + shape[nlev:] - return loop(labels, shape) - - def maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - - labels = map(_ensure_int64, labels) - if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) - - return loop(list(labels), list(shape)) + return out def get_compressed_ids(labels, sizes): From 91a284eedab98bbd04cfe21e5f23917585b04104 Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Tue, 19 Jun 2018 14:53:15 +0300 Subject: [PATCH 2/7] Adding more comments and updated changelog --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/core/sorting.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index e8e09880be248..c51f9af5252f3 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -60,7 +60,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) -- Bug in calculation of group index causing "maximum recursion depth exceeded" errors during ``DataFrame.duplicated`` calls (:issue:`21524`). +- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). - **I/O** diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ffa346f2de9d5..edf04fda42117 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -52,7 +52,8 @@ def _int64_cut_off(shape): return i return len(shape) - def maybe_lift(lab, size): # pormote nan values + def maybe_lift(lab, size): + # promote nan values (assigned to -1 here) so that all values are non-negative return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) labels = map(_ensure_int64, labels) @@ -62,6 +63,8 @@ def maybe_lift(lab, size): # pormote nan values labels = list(labels) shape = list(shape) + # Iteratively process all the labels in chunks sized so less than _INT64_MAX unique int ids + # will be required for each chunk while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) From fac97d5a745a2d1382b853fd01db704905b9d4d3 Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Tue, 19 Jun 2018 15:05:23 +0300 Subject: [PATCH 3/7] Added testcase --- pandas/tests/frame/test_analytics.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6dc24ed856017..23dfd54a2bbc6 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1527,6 +1527,19 @@ def test_duplicated_with_misspelled_column_name(self, subset): with pytest.raises(KeyError): df.drop_duplicates(subset) + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # Given the wide dataframe with a lot of columns with different (important!) values + data = {} + for i in range(100): + data['col_{0:02d}'.format(i)] = np.random.randint(0, 1000, 30000) + df = pd.DataFrame(data).T + # When we request to calculate duplicates + dupes = df.duplicated() + # Then we get the bool pd.Series as a result (don't fail during calculation) + # Actual values doesn't matter here, though usually it's all False in this case + assert isinstance(dupes, pd.Series) + assert dupes.dtype == np.bool + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ From 7e9f3159274fc72e34e72e7e0d7a4a87a1063016 Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Tue, 19 Jun 2018 15:07:50 +0300 Subject: [PATCH 4/7] linter --- pandas/core/sorting.py | 7 ++++--- pandas/tests/frame/test_analytics.py | 9 ++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index edf04fda42117..fb94281cb416f 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -53,7 +53,8 @@ def _int64_cut_off(shape): return len(shape) def maybe_lift(lab, size): - # promote nan values (assigned to -1 here) so that all values are non-negative + # promote nan values (assigned to -1 here) + # so that all values are non-negative return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) labels = map(_ensure_int64, labels) @@ -63,8 +64,8 @@ def maybe_lift(lab, size): labels = list(labels) shape = list(shape) - # Iteratively process all the labels in chunks sized so less than _INT64_MAX unique int ids - # will be required for each chunk + # Iteratively process all the labels in chunks sized so less + # than _INT64_MAX unique int ids will be required for each chunk while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 23dfd54a2bbc6..705f236b1296c 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1528,15 +1528,18 @@ def test_duplicated_with_misspelled_column_name(self, subset): df.drop_duplicates(subset) def test_duplicated_do_not_fail_on_wide_dataframes(self): - # Given the wide dataframe with a lot of columns with different (important!) values + # Given the wide dataframe with a lot of columns + # with different (important!) values data = {} for i in range(100): data['col_{0:02d}'.format(i)] = np.random.randint(0, 1000, 30000) df = pd.DataFrame(data).T # When we request to calculate duplicates dupes = df.duplicated() - # Then we get the bool pd.Series as a result (don't fail during calculation) - # Actual values doesn't matter here, though usually it's all False in this case + # Then we get the bool pd.Series as a result + # and don't fail during calculation. + # Actual values doesn't matter here, though usually + # it's all False in this case assert isinstance(dupes, pd.Series) assert dupes.dtype == np.bool From 727e654b0731b1a073e629cd4fbf15c051df94d5 Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Wed, 20 Jun 2018 12:54:05 +0300 Subject: [PATCH 5/7] address PR review comments (better comment and mark test as slow) --- pandas/core/sorting.py | 4 ++-- pandas/tests/frame/test_analytics.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index fb94281cb416f..212f44e55c489 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -53,8 +53,8 @@ def _int64_cut_off(shape): return len(shape) def maybe_lift(lab, size): - # promote nan values (assigned to -1 here) - # so that all values are non-negative + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) labels = map(_ensure_int64, labels) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 705f236b1296c..8ebe6c3b3de23 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1527,6 +1527,7 @@ def test_duplicated_with_misspelled_column_name(self, subset): with pytest.raises(KeyError): df.drop_duplicates(subset) + @pytest.mark.slow def test_duplicated_do_not_fail_on_wide_dataframes(self): # Given the wide dataframe with a lot of columns # with different (important!) values From befa65d7b6314b3a9d37913524621511d2891e7f Mon Sep 17 00:00:00 2001 From: Michael Odintsov Date: Wed, 20 Jun 2018 13:16:20 +0300 Subject: [PATCH 6/7] address more PR review comments (list comprehension and result name) --- pandas/tests/frame/test_analytics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 8ebe6c3b3de23..240f81c1777c8 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1531,18 +1531,16 @@ def test_duplicated_with_misspelled_column_name(self, subset): def test_duplicated_do_not_fail_on_wide_dataframes(self): # Given the wide dataframe with a lot of columns # with different (important!) values - data = {} - for i in range(100): - data['col_{0:02d}'.format(i)] = np.random.randint(0, 1000, 30000) + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} df = pd.DataFrame(data).T - # When we request to calculate duplicates - dupes = df.duplicated() - # Then we get the bool pd.Series as a result + result = df.duplicated() + # Then duplicates produce the bool pd.Series as a result # and don't fail during calculation. # Actual values doesn't matter here, though usually # it's all False in this case - assert isinstance(dupes, pd.Series) - assert dupes.dtype == np.bool + assert isinstance(result, pd.Series) + assert result.dtype == np.bool def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 From eebb8cf57709b679660b9b48c4db6594279a8b87 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 20 Jun 2018 06:24:35 -0400 Subject: [PATCH 7/7] doc --- pandas/tests/frame/test_analytics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 240f81c1777c8..12ebdbe0fd3c7 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1529,12 +1529,14 @@ def test_duplicated_with_misspelled_column_name(self, subset): @pytest.mark.slow def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 # Given the wide dataframe with a lot of columns # with different (important!) values data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) for i in range(100)} df = pd.DataFrame(data).T result = df.duplicated() + # Then duplicates produce the bool pd.Series as a result # and don't fail during calculation. # Actual values doesn't matter here, though usually