From cdefaae0fb3256a535d3f300af9dcbf5c9a2ce13 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 6 Oct 2020 22:05:05 +0200 Subject: [PATCH 01/20] Fix duplicates in intersection of multiindexes --- doc/source/whatsnew/v1.1.4.rst | 3 ++- pandas/core/indexes/multi.py | 6 ++++-- pandas/tests/indexes/multi/test_setops.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index e63912ebc8fee..9d3e503f518f4 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -14,7 +14,8 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- + +- Regression in :meth:`MultiIndex.intersection` returned duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a157fdfdde447..6a621d78f266d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3468,6 +3468,8 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): + if self.has_duplicates: + return self.unique() return self if not is_object_dtype(other.dtype): @@ -3486,11 +3488,11 @@ def intersection(self, other, sort=False): uniq_tuples = None # flag whether _inner_indexer was successful if self.is_monotonic and other.is_monotonic: try: - uniq_tuples = self._inner_indexer(lvals, rvals)[0] + inner_tuples = self._inner_indexer(lvals, rvals)[0] sort = False # uniq_tuples is already sorted except TypeError: pass - + uniq_tuples = algos.unique(inner_tuples) if uniq_tuples is None: other_uniq = set(rvals) seen = set() diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 6d4928547cad1..121cbe1716b10 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ 
b/pandas/tests/indexes/multi/test_setops.py @@ -376,3 +376,17 @@ def test_setops_disallow_true(method): with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) + + +@pytest.mark.parametrize( + "tuples", [[("val1", "test1")], [("val1", "test1"), ("val1", "test1")]] +) +def test_intersect_with_duplicates(tuples): + # GH: 36915 + left = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + right = pd.MultiIndex.from_tuples( + [("val1", "test1"), ("val1", "test1")], names=["first", "second"] + ) + result = left.intersection(right) + expected = pd.MultiIndex.from_tuples([("val1", "test1")], names=["first", "second"]) + tm.assert_index_equal(result, expected) From fbd63f2d8c2ffd976f4fbc6369b36d8d50af27a5 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 6 Oct 2020 22:33:38 +0200 Subject: [PATCH 02/20] Fix duplicates in index intersection --- pandas/core/indexes/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4967e13a9855a..905f74b5c20a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2746,7 +2746,7 @@ def intersection(self, other, sort=False): self._assert_can_do_setop(other) other = ensure_index(other) - if self.equals(other): + if self.equals(other) and not self.has_duplicates: return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): @@ -2764,6 +2764,7 @@ def intersection(self, other, sort=False): except TypeError: pass else: + result = algos.unique1d(result) return self._wrap_setop_result(other, result) try: @@ -2775,7 +2776,7 @@ def intersection(self, other, sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - taken = other.take(indexer) + taken = other.take(indexer).unique() res_name = get_op_result_name(self, other) if sort is None: From 53a37d11716983dc6c65b35989ccf7476a948e30 
Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 6 Oct 2020 22:39:41 +0200 Subject: [PATCH 03/20] Modify test and avoid None issues --- pandas/core/indexes/multi.py | 4 +++- pandas/tests/indexes/multi/test_setops.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6a621d78f266d..c62d2a4491327 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3492,7 +3492,9 @@ def intersection(self, other, sort=False): sort = False # uniq_tuples is already sorted except TypeError: pass - uniq_tuples = algos.unique(inner_tuples) + else: + uniq_tuples = algos.unique(inner_tuples) + if uniq_tuples is None: other_uniq = set(rvals) seen = set() diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 121cbe1716b10..f084e76668793 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -379,14 +379,23 @@ def test_setops_disallow_true(method): @pytest.mark.parametrize( - "tuples", [[("val1", "test1")], [("val1", "test1"), ("val1", "test1")]] + ("tuples", "exp_tuples"), + [ + ([("val1", "test1")], [("val1", "test1")]), + ([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]), + ( + [("val2", "test2"), ("val1", "test1")], + [("val2", "test2"), ("val1", "test1")], + ), + ], ) -def test_intersect_with_duplicates(tuples): +def test_intersect_with_duplicates(tuples, exp_tuples): # GH: 36915 left = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) right = pd.MultiIndex.from_tuples( - [("val1", "test1"), ("val1", "test1")], names=["first", "second"] + [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], + names=["first", "second"], ) result = left.intersection(right) - expected = pd.MultiIndex.from_tuples([("val1", "test1")], names=["first", "second"]) + expected = pd.MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) 
tm.assert_index_equal(result, expected) From 5675a4e712b26f7fea199207c73a76e354109a0c Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 6 Oct 2020 23:17:26 +0200 Subject: [PATCH 04/20] Fix failing test --- pandas/tests/indexes/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8db1bcc84bfa6..8bc252bcc1120 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -683,7 +683,7 @@ def test_intersection_monotonic(self, index2, keeps_name, sort): @pytest.mark.parametrize( "index2,expected_arr", - [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B"])], ) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique From 582c0b9d752326d36da64a066d105add00b25077 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 9 Oct 2020 09:16:31 +0200 Subject: [PATCH 05/20] Change comment --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3da8b1e5da139..9892c7e149de3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3494,7 +3494,7 @@ def intersection(self, other, sort=False): if self.is_monotonic and other.is_monotonic: try: inner_tuples = self._inner_indexer(lvals, rvals)[0] - sort = False # uniq_tuples is already sorted + sort = False # inner_tuples is already sorted except TypeError: pass else: From 7805de53a6ff47ca9bb9abb296e3b89246663596 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 11 Oct 2020 22:22:13 +0200 Subject: [PATCH 06/20] Add unique after intersection --- pandas/core/ops/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index ae21f13ea3f49..95a5353cc438b 100644 --- 
a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -481,7 +481,9 @@ def _should_reindex_frame_op( # TODO: any other cases we should handle here? cols = left.columns.intersection(right.columns) - if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)): + if len(cols) and not ( + cols.equals(left.columns.unique()) and cols.equals(right.columns.unique()) + ): # TODO: is there a shortcut available when len(cols) == 0? return True From 66b519f8a80a66afad5e9b16828061466bbb0446 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 11 Oct 2020 22:49:10 +0200 Subject: [PATCH 07/20] Fix merge bug --- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7d2c7e7c95dfa..d06f6be7ac73f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2757,8 +2757,7 @@ def intersection(self, other, sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - taken = other.take(indexer).unique() - res_name = get_op_result_name(self, other) + result = other.take(indexer).unique() if sort is None: result = algos.safe_sort(result.values) From cb1477b246a90f45c0774be4f2f696651936f7a9 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 11 Oct 2020 23:00:35 +0200 Subject: [PATCH 08/20] Add tests and whatsnew --- doc/source/whatsnew/v1.1.4.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 4 +++- pandas/tests/indexes/test_setops.py | 10 ++++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index 57604a38bb642..aa2c77da4ee6f 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -20,7 +20,6 @@ Fixed regressions - Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`) - Fixed regression in :meth:`Series.astype` converting ``None`` to 
``"nan"`` when casting to string (:issue:`36904`) - Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`) -- Regression in :meth:`MultiIndex.intersection` returned duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bd3112403b31b..3cb4f14d13ccd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -405,7 +405,8 @@ MultiIndex - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) -- +- Bug in :meth:`MultiIndex.intersection` returned duplicates when at least one of the indexes had duplicates (:issue:`36915`) + I/O ^^^ @@ -485,6 +486,7 @@ Other - Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors (:issue:`28283`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) +- Bug in :meth:`Index.intersection` returned duplicates when at least one of the indexes had duplicates (:issue:`31326`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 1a40fe550be61..be54dce4fed26 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -95,3 +95,13 @@ def test_union_dtypes(left, right, expected): b = pd.Index([], dtype=right) result = (a | b).dtype assert result == expected + + +@pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]]) +def test_intersection_duplicates(values): + # GH: 31326 + a = pd.Index(values) + b = pd.Index([3, 3]) + result = a.intersection(b) + expected = pd.Index([3]) + tm.assert_index_equal(result, expected) From 0fb25612cc66c139cbd8d73e2ec0d9e13b62fbb5 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 26 Oct 2020 15:45:25 +0100 Subject: [PATCH 09/20] Add rename --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fe0a9a4e5e629..d545de47d17fa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3476,7 +3476,7 @@ def intersection(self, other, sort=False): if self.equals(other): if self.has_duplicates: - return self.unique() + return self.unique().rename(result_names) return self.rename(result_names) if not is_object_dtype(other.dtype): From 10524fdde24403cf705a57bb6adfe2f3a94c71c6 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 26 Oct 2020 20:31:42 +0100 Subject: [PATCH 10/20] Fix check in merge operation --- pandas/core/reshape/merge.py | 9 +++++++-- pandas/tests/reshape/merge/test_merge.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5012be593820e..e5ce30095b710 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1207,7 +1207,9 @@ def _validate_specification(self): raise MergeError("Must pass left_on or left_index=True") else: # 
use the common columns - common_cols = self.left.columns.intersection(self.right.columns) + left_cols = self.left.columns + right_cols = self.right.columns + common_cols = left_cols.intersection(right_cols) if len(common_cols) == 0: raise MergeError( "No common columns to perform merge on. " @@ -1216,7 +1218,10 @@ def _validate_specification(self): f"left_index={self.left_index}, " f"right_index={self.right_index}" ) - if not common_cols.is_unique: + if ( + not left_cols.join(common_cols, how="inner").is_unique + or not right_cols.join(common_cols, how="inner").is_unique + ): raise MergeError(f"Data columns not unique: {repr(common_cols)}") self.left_on = self.right_on = common_cols elif self.on is not None: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c4c9b0e516192..e3d1a8bb29984 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -741,7 +741,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) From 3dde0eef56f4f6b2b83c47af577c56001cbb1229 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 12 Nov 2020 12:51:42 +0100 Subject: [PATCH 11/20] Exit set ops when nonunique --- pandas/core/ops/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index fa4fff46196b0..d024709feac25 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -307,13 +307,14 @@ def should_reindex_frame_op( if not isinstance(right, ABCDataFrame): return False + if not left.columns.is_unique or not right.columns.is_unique: + return False + if fill_value is None and level is None and axis is default_axis: # TODO: any other 
cases we should handle here? cols = left.columns.intersection(right.columns) - if len(cols) and not ( - cols.equals(left.columns.unique()) and cols.equals(right.columns.unique()) - ): + if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)): # TODO: is there a shortcut available when len(cols) == 0? return True From d71a499770ec03a987640f78ee2d370d538d9691 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 01:19:13 +0100 Subject: [PATCH 12/20] Roll back to initial version --- pandas/core/ops/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d024709feac25..fa4fff46196b0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -307,14 +307,13 @@ def should_reindex_frame_op( if not isinstance(right, ABCDataFrame): return False - if not left.columns.is_unique or not right.columns.is_unique: - return False - if fill_value is None and level is None and axis is default_axis: # TODO: any other cases we should handle here? cols = left.columns.intersection(right.columns) - if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)): + if len(cols) and not ( + cols.equals(left.columns.unique()) and cols.equals(right.columns.unique()) + ): # TODO: is there a shortcut available when len(cols) == 0? 
return True From d873d5a849868e3f4bee85a59263d3f07fd54fb7 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 22 Nov 2020 21:35:44 +0100 Subject: [PATCH 13/20] Change whatsnew --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c20fa0e5a4bb9..dbce8ffa7a415 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -627,7 +627,7 @@ MultiIndex - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) - Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`) -- Bug in :meth:`MultiIndex.intersection` returned duplicates when at least one of the indexes had duplicates (:issue:`36915`) +- Bug in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) I/O ^^^ @@ -749,7 +749,7 @@ Other - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) - Bug in ``accessor.DirNamesMixin``, where ``dir(obj)`` wouldn't show attributes defined on the instance (:issue:`37173`). - Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) -- Bug in :meth:`Index.intersection` returned duplicates when at least one of the indexes had duplicates (:issue:`31326`) +- Bug in :meth:`Index.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`31326`) .. 
--------------------------------------------------------------------------- From e90239ad526d24d8f51a080ac1f8bcfc973ab9eb Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 Nov 2020 19:28:21 +0100 Subject: [PATCH 14/20] Move whatsnew --- doc/source/whatsnew/v1.1.5.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 323342cb43950..d003063b062a7 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -17,7 +17,7 @@ Fixed regressions - Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) - Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) -- +- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index dbce8ffa7a415..7ce9cfc7e9ec4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -627,7 +627,6 @@ MultiIndex - Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`) - Bug in :meth:`MultiIndex.drop` dropped ``NaN`` values when non existing key was given as input (:issue:`18853`) -- Bug in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) I/O ^^^ From 742716e5c68b44b9718f3903830615ee7635ac23 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 Nov 2020 19:30:46 +0100 Subject: [PATCH 15/20] Change gh reference --- pandas/tests/indexes/multi/test_setops.py | 2 +- pandas/tests/indexes/test_setops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 0c07f9c64c12d..e1185f6b1ba27 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -392,7 +392,7 @@ def test_setops_disallow_true(method): ], ) def test_intersect_with_duplicates(tuples, exp_tuples): - # GH: 36915 + # GH#36915 left = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) right = pd.MultiIndex.from_tuples( [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 659fba6f9cf72..2675c4569a8e9 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -122,7 +122,7 @@ def 
test_dunder_inplace_setops_deprecated(index): @pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]]) def test_intersection_duplicates(values): - # GH: 31326 + # GH#31326 a = pd.Index(values) b = pd.Index([3, 3]) result = a.intersection(b) From 321797a50d349f05f8a6686c51925df83e3f0fae Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 Nov 2020 19:31:38 +0100 Subject: [PATCH 16/20] Remove pd --- pandas/tests/indexes/multi/test_setops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index e1185f6b1ba27..2ac57f1befd57 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -393,11 +393,11 @@ def test_setops_disallow_true(method): ) def test_intersect_with_duplicates(tuples, exp_tuples): # GH#36915 - left = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) - right = pd.MultiIndex.from_tuples( + left = MultiIndex.from_tuples(tuples, names=["first", "second"]) + right = MultiIndex.from_tuples( [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], names=["first", "second"], ) result = left.intersection(right) - expected = pd.MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) + expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) tm.assert_index_equal(result, expected) From a980ec08067bd7ec8a862bcbf086b47d4fd36987 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 28 Nov 2020 18:51:45 +0100 Subject: [PATCH 17/20] Remove whatsnew from 1.2 --- doc/source/whatsnew/v1.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b4af305383737..049ccc0e6c4df 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -774,7 +774,6 @@ Other - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare 
``Exception`` (:issue:`35744`) - Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) - Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) -- Bug in :meth:`Index.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`31326`) .. --------------------------------------------------------------------------- From 972fd482931d8257760f8b55752037db0b6f03c0 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 28 Nov 2020 18:53:03 +0100 Subject: [PATCH 18/20] Fix test --- pandas/tests/indexes/base_class/test_setops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 6413b110dff2e..ddcb3c5b87ebc 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -141,7 +141,7 @@ def test_intersection_str_dates(self, sort): @pytest.mark.parametrize( "index2,expected_arr", - [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B"])], ) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique From fe1ded42428b7df0948fcc08b1ee7555bb31004c Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 28 Nov 2020 19:27:49 +0100 Subject: [PATCH 19/20] Make condition more clear and add assert --- pandas/core/indexes/base.py | 6 ++++-- pandas/core/ops/__init__.py | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 503d0543ede28..fd82e0efce85e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2847,8 +2847,7 @@ def _intersection(self, other, sort=False): except TypeError: pass else: - result = 
algos.unique1d(result) - return result + return algos.unique1d(result) try: indexer = Index(rvals).get_indexer(lvals) @@ -2864,6 +2863,9 @@ def _intersection(self, other, sort=False): if sort is None: result = algos.safe_sort(result) + # Intersection has to be unique + assert np.array_equal(algos.unique(result), result) + return result def difference(self, other, sort=None): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index fa4fff46196b0..d8b5dba424cbf 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -311,9 +311,10 @@ def should_reindex_frame_op( # TODO: any other cases we should handle here? cols = left.columns.intersection(right.columns) - if len(cols) and not ( - cols.equals(left.columns.unique()) and cols.equals(right.columns.unique()) - ): + # Intersection is always unique so we have to check the unique columns + left_uniques = left.columns.unique() + right_uniques = right.columns.unique() + if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)): # TODO: is there a shortcut available when len(cols) == 0? return True From 8e4d47b7a4bcba35f2f947823bdb67c5e53af94e Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 28 Nov 2020 21:01:58 +0100 Subject: [PATCH 20/20] Use shape for equality check --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fd82e0efce85e..f746f7a8adbca 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2864,7 +2864,7 @@ def _intersection(self, other, sort=False): result = algos.safe_sort(result) # Intersection has to be unique - assert np.array_equal(algos.unique(result), result) + assert algos.unique(result).shape == result.shape return result