From aa98b8667ea071d014d942d9727e7d9010680d1c Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 21:07:53 +0100 Subject: [PATCH 01/11] BUG: Index.drop raising Error when Index has duplicates --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 5 ++++- pandas/core/indexes/multi.py | 3 ++- pandas/tests/indexes/multi/test_drop.py | 8 ++++++++ pandas/tests/indexes/test_base.py | 7 +++++++ 6 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ad5af5df710ba..04b8fd1451736 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -636,6 +636,7 @@ MultiIndex - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) - Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`) +- Bug in :meth:`MultiIndex.drop` dropping more values than expected when index has duplicates and is not sorted (:issue:`38051`) I/O ^^^ @@ -758,6 +759,7 @@ Other - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) - Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`). - Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) +- Bug in :meth:`Index.drop` raising ``InvalidIndexError`` when index has duplicates (:issue:`38051`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b87c4ea8b9cc..5b7edaf97db73 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3214,7 +3214,7 @@ def _set_value(self, index, col, value, takeable: bool = False): series._values[loc] = value # Note: trying to use series._set_value breaks tests in # tests.frame.indexing.test_indexing and tests.indexing.test_partial - except (KeyError, TypeError): + except (KeyError, TypeError, ValueError): # set using a non-recursive method & reset the cache if takeable: self.iloc[index, col] = value diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b5900ead246f3..15854d9e1f1f6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5508,7 +5508,10 @@ def drop(self, labels, errors: str_t = "raise"): """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) - indexer = self.get_indexer(labels) + if not self.is_unique: + indexer = self.get_indexer(labels) + else: + indexer, _ = self.get_indexer_non_unique(labels) mask = indexer == -1 if mask.any(): if errors != "ignore": diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index be0b0c5208b1c..949369c84487f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2169,7 +2169,8 @@ def drop(self, codes, level=None, errors="raise"): if isinstance(loc, int): inds.append(loc) elif isinstance(loc, slice): - inds.extend(range(loc.start, loc.stop)) + step = loc.step if loc.step is not None else 1 + inds.extend(range(loc.start, loc.stop, step)) elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 06019ed0a8b14..ec837474c1cf5 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -147,3 +147,11 @@ def test_drop_with_nan_in_index(nulls_fixture): msg = r"labels \[Timestamp\('2001-01-01 00:00:00'\)\] not found in level" with pytest.raises(KeyError, match=msg): mi.drop(pd.Timestamp("2001"), level="date") + + +def test_drop_with_non_monotonic_duplicates(): + # GH#38051 + mi = MultiIndex.from_tuples([(1, 2), (2, 3), (1, 2)]) + result = mi.drop((1, 2)) + expected = MultiIndex.from_tuples([(2, 3)]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2e3a70e8c2215..92d3a3385547b 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1496,6 +1496,13 @@ def test_drop_tuple(self, values, to_drop): with pytest.raises(KeyError, match=msg): removed.drop(drop_me) + def test_drop_with_duplicates_in_index(self): + # GH38051 + index = Index([0, 1, 0, 1]) + result = index.drop(0) + expected = Index([1, 1]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "attr", [ From 63164691e6dff33881f8579c8943ff5dd6d8eebd Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 21:08:12 +0100 Subject: [PATCH 02/11] Remove not --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 15854d9e1f1f6..efde6e66fd4d8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5508,7 +5508,7 @@ def drop(self, labels, errors: str_t = "raise"): """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) - if not self.is_unique: + if self.is_unique: indexer = self.get_indexer(labels) else: indexer, _ = self.get_indexer_non_unique(labels) From a5a28bc08bd29be5a8b30467220a946f1acc8889 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 21:10:31 +0100 Subject: [PATCH 03/11] Remove value error --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b7edaf97db73..5b87c4ea8b9cc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3214,7 +3214,7 @@ def _set_value(self, index, col, value, takeable: bool = False): series._values[loc] = value # Note: trying to use series._set_value breaks tests in # tests.frame.indexing.test_indexing and tests.indexing.test_partial - except (KeyError, TypeError, ValueError): + except (KeyError, TypeError): # set using a non-recursive method & reset the cache if takeable: self.iloc[index, col] = value From 965a7f0e496d425c0a5d7876be706bf13aac1a5f Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 21:28:14 +0100 Subject: [PATCH 04/11] Change issue number --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/indexes/multi/test_drop.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 04b8fd1451736..0903ee7495957 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -636,7 +636,7 @@ MultiIndex - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) - Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`) -- Bug in :meth:`MultiIndex.drop` dropping more values than expected when index has duplicates and is not sorted (:issue:`38051`) +- Bug in :meth:`MultiIndex.drop` dropping more values than expected when index has duplicates and is not sorted (:issue:`33494`) I/O ^^^ diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index ec837474c1cf5..d735a342c9540 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -150,7 +150,7 @@ def test_drop_with_nan_in_index(nulls_fixture): def test_drop_with_non_monotonic_duplicates(): - # GH#38051 + # GH#33494 mi = MultiIndex.from_tuples([(1, 2), (2, 3), (1, 2)]) result = mi.drop((1, 2)) expected = MultiIndex.from_tuples([(2, 3)]) From 237cdf38937e81fbbc36c9e498d1ff526c76919d Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 21:37:23 +0100 Subject: [PATCH 05/11] Use get indexer for --- pandas/core/indexes/base.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index efde6e66fd4d8..7973e7a078be1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5508,10 +5508,7 @@ def drop(self, labels, errors: str_t = "raise"): """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) - if self.is_unique: - indexer = self.get_indexer(labels) - else: - indexer, _ = self.get_indexer_non_unique(labels) + indexer = self.get_indexer_for(labels) mask = indexer == -1 if mask.any(): if errors != "ignore": From 8c9659d5721f1eb3032f0ac06973b4f672023ffd Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 21:38:43 +0100 Subject: [PATCH 06/11] Catch warning --- pandas/tests/indexes/multi/test_drop.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index d735a342c9540..a64962449dfb1 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pytest @@ -152,6 +154,8 @@ def test_drop_with_nan_in_index(nulls_fixture): def test_drop_with_non_monotonic_duplicates(): # GH#33494 mi = MultiIndex.from_tuples([(1, 2), (2, 3), (1, 2)]) - result = mi.drop((1, 2)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PerformanceWarning) + result = mi.drop((1, 2)) expected = MultiIndex.from_tuples([(2, 3)]) tm.assert_index_equal(result, expected) From caf05a713be9c331744889529541219525aa9608 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 25 Nov 2020 23:37:59 +0100 Subject: [PATCH 07/11] Change test --- pandas/tests/indexes/test_base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 92d3a3385547b..72fb7c106c921 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1496,11 +1496,13 @@ def test_drop_tuple(self, values, to_drop): with pytest.raises(KeyError, match=msg): removed.drop(drop_me) - def test_drop_with_duplicates_in_index(self): + def test_drop_with_duplicates_in_index(self, index): # GH38051 - index = Index([0, 1, 0, 1]) - result = index.drop(0) - expected = Index([1, 1]) + if len(index) == 0: + return + expected = index.drop(index[0]).repeat(2) + index = index.repeat(2) + result = index.drop(index[0]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( From c46fa3bb8e6b8c3c0ee1b61ec506787e05aecfd9 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 26 Nov 2020 23:14:27 +0100 Subject: [PATCH 08/11] Exit for multiindex because of performance warning when multiindex is not sorted --- pandas/tests/indexes/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 50589e87b0191..7c5d69de46d2c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1496,7 +1496,7 @@ def test_drop_tuple(self, values, to_drop): def test_drop_with_duplicates_in_index(self, index): # GH38051 - if len(index) == 0: + if len(index) == 0 or isinstance(index, MultiIndex): return expected = index.drop(index[0]).repeat(2) index = index.repeat(2) From e7ea44815fb4c2684b479c6f5fbdd1ee7f43dc9d Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 26 Nov 2020 23:24:34 +0100 Subject: [PATCH 09/11] Improve test --- pandas/tests/indexes/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7c5d69de46d2c..f92846a3ea644 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1498,8 +1498,8 @@ def test_drop_with_duplicates_in_index(self, index): # GH38051 if len(index) == 0 or isinstance(index, MultiIndex): return - expected = index.drop(index[0]).repeat(2) - index = index.repeat(2) + index = index.unique().repeat(2) + expected = index[2:] result = index.drop(index[0]) tm.assert_index_equal(result, expected) From 77392ed87034d23f08302877a7f4e6844a395c34 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 2 Dec 2020 19:54:39 +0100 Subject: [PATCH 10/11] Skip test --- pandas/tests/indexes/test_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d181a9e81f360..fb2d0f22498f9 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,6 +9,8 @@ import pytest from pandas._libs.tslib import Timestamp + +from pandas.compat import IS64 from pandas.compat.numpy import np_datetime64_compat from pandas.util._test_decorators import async_mark @@ -19,6 +21,7 @@ DatetimeIndex, Float64Index, Int64Index, + IntervalIndex, PeriodIndex, RangeIndex, Series, @@ -1509,6 +1512,8 @@ def test_drop_with_duplicates_in_index(self, index): # GH38051 if len(index) == 0 or isinstance(index, MultiIndex): return + if isinstance(index, IntervalIndex) and not IS64: + pytest.skip("Cannot test IntervalIndex with int64 dtype on 32 bit platform") index = index.unique().repeat(2) expected = index[2:] result = index.drop(index[0]) From f8b0b98e2a4b55691b22407a1317750b44431918 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 2 Dec 2020 20:20:01 +0100 Subject: [PATCH 11/11] Remove line --- pandas/tests/indexes/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index fb2d0f22498f9..d5ca8a0f64fac 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,7 +9,6 @@ import pytest from pandas._libs.tslib import Timestamp - from pandas.compat import IS64 from pandas.compat.numpy import np_datetime64_compat from pandas.util._test_decorators import async_mark