From 03af0c191b94ec5d4fd811535ad2d5694321abe6 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Fri, 17 Aug 2018 11:52:58 +0900 Subject: [PATCH 01/13] Bug in :meth:`DataFrame.drop_duplicates`for empty DataFrame throws error (:issue:`20516`) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 3 +++ pandas/tests/frame/test_duplicates.py | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index cf12759c051fc..56c64b35d0131 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -711,7 +711,7 @@ Reshaping - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) - Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the `to_replace` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- +- Bug in :meth:`DataFrame.drop_duplicates`for empty DataFrame throws error (:issue:`20516`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 78ad9728800d6..89ac69f496649 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4369,6 +4369,9 @@ def duplicated(self, subset=None, keep='first'): from pandas.core.sorting import get_group_index from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + if self.empty: + return Series() + def f(vals): labels, shape = algorithms.factorize( vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 940692ec5b46a..f5b16f70c68ee 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -263,6 +263,13 @@ def test_drop_duplicates_tuple(): tm.assert_frame_equal(result, expected) +def test_drop_duplicates_empty(): + # GH 20516 + df = DataFrame() + result = df.drop_duplicates() + tm.assert_frame_equal(result, df) + + def test_drop_duplicates_NA(): # none df = DataFrame({'A': [None, None, 'foo', 'bar', From 79ee155e1e6ccf0ad60fac1199d6b7025d191736 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Fri, 17 Aug 2018 13:02:24 +0900 Subject: [PATCH 02/13] fixed what's new to render well --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 56c64b35d0131..fe9d0e45d765c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -711,7 +711,7 @@ Reshaping - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) - Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the `to_replace` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates`for empty DataFrame throws error (:issue:`20516`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty DataFrame throws error (:issue:`20516`) Build Changes ^^^^^^^^^^^^^ From 31f609993bc9ff7481021bc0c982adc0cfe87bbb Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Sat, 18 Aug 2018 09:33:50 +0900 Subject: [PATCH 03/13] Applied changes according to reviews by @jreback --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/tests/frame/test_duplicates.py | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fe9d0e45d765c..a6a71c65284fd 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -711,7 +711,7 @@ Reshaping - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) - Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the `to_replace` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty DataFrame throws error (:issue:`20516`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises error (:issue:`20516`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index f5b16f70c68ee..58b8020c38421 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -265,9 +265,26 @@ def test_drop_duplicates_tuple(): def test_drop_duplicates_empty(): # GH 20516 - df = DataFrame() - result = df.drop_duplicates() - tm.assert_frame_equal(result, df) + expected = DataFrame() + result = expected.drop_duplicates() + tm.assert_frame_equal(result, expected) + + expected = DataFrame(columns=[]) + result = expected.drop_duplicates() + tm.assert_frame_equal(result, expected) + + expected = DataFrame(columns=['A', 'B', 'C']) + result = expected.drop_duplicates() + tm.assert_frame_equal(result, expected) + + expected = DataFrame(index=[]) + result = expected.drop_duplicates() + tm.assert_frame_equal(result, expected) + + expected = DataFrame(index=['A', 'B', 'C']) + result = expected.drop_duplicates() + tm.assert_frame_equal(result, expected) + def test_drop_duplicates_NA(): From 6eb53f6c968507527d7fb943f27feb43f8a2c661 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Sat, 18 Aug 2018 09:36:19 +0900 Subject: [PATCH 04/13] removed an additional line --- pandas/tests/frame/test_duplicates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 58b8020c38421..2a4fb23a12ddb 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -286,7 +286,6 @@ def test_drop_duplicates_empty(): tm.assert_frame_equal(result, expected) - def test_drop_duplicates_NA(): # none df = DataFrame({'A': [None, None, 'foo', 'bar', From de745bb426c0e154ed52e3ea235dea76a61e11a7 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Sun, 19 Aug 2018 16:26:02 +0900 Subject: [PATCH 05/13] changed test to accomodate the column behavior in selection --- pandas/tests/frame/test_duplicates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 2a4fb23a12ddb..32b4f42b91bbb 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -273,8 +273,9 @@ def test_drop_duplicates_empty(): result = expected.drop_duplicates() tm.assert_frame_equal(result, expected) - expected = DataFrame(columns=['A', 'B', 'C']) - result = expected.drop_duplicates() + df = DataFrame(columns=['A', 'B', 'C']) + result = df.drop_duplicates() + expected = DataFrame(columns=[]) # The column infos are not carrying over tm.assert_frame_equal(result, expected) expected = DataFrame(index=[]) From 3a5d97d2c4120782b2400c80c6c9b1febbef7031 Mon Sep 17 00:00:00 2001 From: jin Date: Wed, 22 Aug 2018 17:42:40 +0900 Subject: [PATCH 06/13] Parameterized the tests --- pandas/tests/frame/test_duplicates.py | 32 +++++++++------------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 32b4f42b91bbb..88a49b3324528 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -262,29 +262,19 @@ def test_drop_duplicates_tuple(): result = df.drop_duplicates((('AA', 'AB'), 'B')) tm.assert_frame_equal(result, expected) - -def test_drop_duplicates_empty(): +@pytest.mark.parametrize('df', [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=['A', 'B', 'C']), + DataFrame(index=[]), + DataFrame(index=['A', 'B', 'C']) +]) +def test_drop_duplicates_empty(df): # GH 20516 - expected = DataFrame() - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) - - expected = DataFrame(columns=[]) - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) - - df = DataFrame(columns=['A', 'B', 'C']) result = df.drop_duplicates() - expected = DataFrame(columns=[]) # The column infos are not carrying over - tm.assert_frame_equal(result, expected) - - expected = DataFrame(index=[]) - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) - - expected = DataFrame(index=['A', 'B', 'C']) - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) + if df.columns.empty is False: + result = DataFrame(columns=[]) + tm.assert_frame_equal(df, expected) def test_drop_duplicates_NA(): From 1f58a854815f0edd5f579155866ee65ea92866d5 Mon Sep 17 00:00:00 2001 From: hyuntruth Date: Wed, 22 Aug 2018 17:45:49 +0900 Subject: [PATCH 07/13] Parameterized the tests --- pandas/tests/frame/test_duplicates.py | 32 +++++++++------------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 32b4f42b91bbb..88a49b3324528 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -262,29 +262,19 @@ def test_drop_duplicates_tuple(): result = df.drop_duplicates((('AA', 'AB'), 'B')) tm.assert_frame_equal(result, expected) - -def test_drop_duplicates_empty(): +@pytest.mark.parametrize('df', [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=['A', 'B', 'C']), + DataFrame(index=[]), + DataFrame(index=['A', 'B', 'C']) +]) +def test_drop_duplicates_empty(df): # GH 20516 - expected = DataFrame() - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) - - expected = DataFrame(columns=[]) - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) - - df = DataFrame(columns=['A', 'B', 'C']) result = df.drop_duplicates() - expected = DataFrame(columns=[]) # The column infos are not carrying over - tm.assert_frame_equal(result, expected) - - expected = DataFrame(index=[]) - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) - - expected = DataFrame(index=['A', 'B', 'C']) - result = expected.drop_duplicates() - tm.assert_frame_equal(result, expected) + if df.columns.empty is False: + result = DataFrame(columns=[]) + tm.assert_frame_equal(df, expected) def test_drop_duplicates_NA(): From cab09581d0e64ae2cd077a12d7ec2c48a0e88df2 Mon Sep 17 00:00:00 2001 From: hyuntruth Date: Wed, 22 Aug 2018 17:49:10 +0900 Subject: [PATCH 08/13] Adhere to flake8 --- pandas/tests/frame/test_duplicates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 88a49b3324528..fdc76b664b1a2 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -262,6 +262,7 @@ def test_drop_duplicates_tuple(): result = df.drop_duplicates((('AA', 'AB'), 'B')) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('df', [ DataFrame(), DataFrame(columns=[]), @@ -274,7 +275,7 @@ def test_drop_duplicates_empty(df): result = df.drop_duplicates() if df.columns.empty is False: result = DataFrame(columns=[]) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(result, df) def test_drop_duplicates_NA(): From 68d69dbd8fa592f3a5120a92717a9ef7d00f7a41 Mon Sep 17 00:00:00 2001 From: hyuntruth Date: Wed, 22 Aug 2018 18:14:04 +0900 Subject: [PATCH 09/13] switched df --- pandas/tests/frame/test_duplicates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index fdc76b664b1a2..7e1b5748bbf03 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -274,7 +274,7 @@ def test_drop_duplicates_empty(df): # GH 20516 result = df.drop_duplicates() if df.columns.empty is False: - result = DataFrame(columns=[]) + df = DataFrame(columns=[]) tm.assert_frame_equal(result, df) From fb8845d421f0d52ec3ad4d19c5a11bff1a86ce81 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Wed, 22 Aug 2018 21:17:01 +0900 Subject: [PATCH 10/13] Try catching for empty dataframes and return self --- pandas/core/frame.py | 3 +++ pandas/tests/frame/test_duplicates.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 89ac69f496649..4d98841027d46 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4335,6 +4335,9 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): ------- deduplicated : DataFrame """ + if self.empty: + return self + inplace = validate_bool_kwarg(inplace, 'inplace') duplicated = self.duplicated(subset, keep=keep) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 7e1b5748bbf03..a767ef43091ae 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -273,8 +273,6 @@ def test_drop_duplicates_tuple(): def test_drop_duplicates_empty(df): # GH 20516 result = df.drop_duplicates() - if df.columns.empty is False: - df = DataFrame(columns=[]) tm.assert_frame_equal(result, df) From 1f125454c71492bdccaebc646c8ea56510d66429 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Wed, 22 Aug 2018 21:29:45 +0900 Subject: [PATCH 11/13] change requested applied --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a6a71c65284fd..9a77e3accb3dc 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -711,7 +711,7 @@ Reshaping - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) - Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the `to_replace` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises error (:issue:`20516`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d98841027d46..0e1463693a7fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4336,7 +4336,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): deduplicated : DataFrame """ if self.empty: - return self + return self.copy() inplace = validate_bool_kwarg(inplace, 'inplace') duplicated = self.duplicated(subset, keep=keep) From 20c03eff9b9bc34623607e8fe25f8ae711e87b28 Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Wed, 22 Aug 2018 21:56:29 +0900 Subject: [PATCH 12/13] added inplace=True tests --- pandas/tests/frame/test_duplicates.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index a767ef43091ae..56838b5b1c53d 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -275,6 +275,9 @@ def test_drop_duplicates_empty(df): result = df.drop_duplicates() tm.assert_frame_equal(result, df) + result = df.drop_duplicates(inplace=True) + tm.assert_frame_equal(result, df) + def test_drop_duplicates_NA(): # none From fc61899019cdf98e5a5b354fec52f29cef3fa41a Mon Sep 17 00:00:00 2001 From: HyunTruth <1234hjlee@naver.com> Date: Wed, 22 Aug 2018 22:38:00 +0900 Subject: [PATCH 13/13] rectified inplace test to reflect actual usage --- pandas/tests/frame/test_duplicates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 56838b5b1c53d..3478d66b919a6 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -275,7 +275,8 @@ def test_drop_duplicates_empty(df): result = df.drop_duplicates() tm.assert_frame_equal(result, df) - result = df.drop_duplicates(inplace=True) + result = df.copy() + result.drop_duplicates(inplace=True) tm.assert_frame_equal(result, df)