From f930071cc6c57508e18ab5680690dbf48ddf97dd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Sep 2022 16:08:51 +0200 Subject: [PATCH 1/8] BUG: MultiIndex.difference not keeping ea dtype --- asv_bench/benchmarks/multiindex_object.py | 2 +- pandas/core/indexes/base.py | 5 ++++- pandas/core/indexes/multi.py | 11 +++-------- pandas/tests/indexes/multi/test_setops.py | 21 +++++++++++++++++++++ 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 89d74e784b64c..221e3ca8471ec 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -238,7 +238,7 @@ class SetOperations: params = [ ("monotonic", "non_monotonic"), ("datetime", "int", "string"), - ("intersection", "union", "symmetric_difference"), + ("intersection", "union", "symmetric_difference", "difference"), ] param_names = ["index_structure", "dtype", "method"] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 52150eafd7783..e3bb7978a6911 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3640,7 +3640,10 @@ def _difference(self, other, sort): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - the_diff = this._values.take(label_diff) + if isinstance(this, ABCMultiIndex): + the_diff = this.take(label_diff) + else: + the_diff = this._values.take(label_diff) the_diff = _maybe_try_sort(the_diff, sort) return the_diff diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dd63ea94d5211..df625a4b1027d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3730,18 +3730,13 @@ def _wrap_intersection_result(self, other, result) -> MultiIndex: else: return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) - def _wrap_difference_result(self, other, result) -> MultiIndex: + def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex: _, result_names = self._convert_can_do_setop(other) if len(result) == 0: - return MultiIndex( - levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - names=result_names, - verify_integrity=False, - ) + return result.remove_unused_levels().set_names(result_names) else: - return MultiIndex.from_tuples(result, sortorder=0, names=result_names) + return result.set_names(result_names) def _convert_can_do_setop(self, other): result_names = self.names diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index ce310a75e8e45..5d9083e43f983 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -440,6 +440,27 @@ def test_setops_disallow_true(method): getattr(idx1, method)(idx2, sort=True) +@pytest.mark.parametrize("val", [pd.NA, 100]) +def test_difference_keep_ea_dtypes(any_numeric_ea_dtype, val): + # GH# + midx = MultiIndex.from_arrays( + [Series([1, 2], dtype=any_numeric_ea_dtype), [2, 1]], names=["a", None] + ) + midx2 = MultiIndex.from_arrays( + [Series([1, 2, val], dtype=any_numeric_ea_dtype), [1, 1, 3]] + ) + result = midx.difference(midx2) + expected = MultiIndex.from_arrays([Series([1], dtype=any_numeric_ea_dtype), [2]]) + tm.assert_index_equal(result, expected) + + result = midx.difference(midx.sort_values(ascending=False)) + expected = MultiIndex.from_arrays( + [Series([], dtype=any_numeric_ea_dtype), Series([], dtype=int)], + names=["a", None], + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( ("tuples", "exp_tuples"), [ From 8640ee3e8481ea1fcbb2dc777b0842ee861192ae Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Sep 2022 16:17:52 +0200 Subject: [PATCH 2/8] Add asv --- asv_bench/benchmarks/multiindex_object.py | 36 ++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 221e3ca8471ec..f5c3ce8087d29 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -238,7 +238,7 @@ class SetOperations: params = [ ("monotonic", "non_monotonic"), ("datetime", "int", "string"), - ("intersection", "union", "symmetric_difference", "difference"), + ("intersection", "union", "symmetric_difference"), ] param_names = ["index_structure", "dtype", "method"] @@ -272,6 +272,40 @@ def time_operation(self, index_structure, dtype, method): getattr(self.left, method)(self.right) +class Difference: + + params = [ + ("datetime", "int", "string"), + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10**5 + level1 = range(1000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + dates_left = MultiIndex.from_product([level1, level2]) + + level2 = range(N // 1000) + int_left = MultiIndex.from_product([level1, level2]) + + level2 = tm.makeStringIndex(N // 1000).values + str_left = MultiIndex.from_product([level1, level2]) + + data = { + "datetime": dates_left, + "int": int_left, + "string": str_left, + } + + data = {k: {"left": mi, "right": mi[:5]} for k, mi in data.items()} + self.left = data[dtype]["left"] + self.right = data[dtype]["right"] + + def time_difference(self, dtype): + self.left.difference(self.right) + + class Unique: params = [ (("Int64", NA), ("int64", 0)), From e52941509b417f291bcc18062d9cbd8de2a1cc35 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Sep 2022 16:18:43 +0200 Subject: [PATCH 3/8] Add whatsnew --- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/tests/indexes/multi/test_setops.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 405b8cc0a5ded..3427b22d27679 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -175,6 +175,7 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`) - Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`) - Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 5d9083e43f983..ba9d02964e785 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -442,7 +442,7 @@ def test_setops_disallow_true(method): @pytest.mark.parametrize("val", [pd.NA, 100]) def test_difference_keep_ea_dtypes(any_numeric_ea_dtype, val): - # GH# + # GH#48606 midx = MultiIndex.from_arrays( [Series([1, 2], dtype=any_numeric_ea_dtype), [2, 1]], names=["a", None] ) From c8b6d846ffd10cd23a89a00af2597329bc2010b1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Sep 2022 16:20:07 +0200 Subject: [PATCH 4/8] Reduce asv --- asv_bench/benchmarks/multiindex_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index f5c3ce8087d29..8295bf0e37b47 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -280,7 +280,7 @@ class Difference: param_names = ["dtype"] def setup(self, dtype): - N = 10**5 + N = 10**4 level1 = range(1000) level2 = date_range(start="1/1/2000", periods=N // 1000) From 51a6974e5c58f528b903556be8fa1ad63f170817 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Sep 2022 16:23:21 +0200 Subject: [PATCH 5/8] Ad ea asv --- asv_bench/benchmarks/multiindex_object.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 8295bf0e37b47..eb56dd3a0bab4 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -275,12 +275,12 @@ def time_operation(self, index_structure, dtype, method): class Difference: params = [ - ("datetime", "int", "string"), + ("datetime", "int", "string", "ea_int"), ] param_names = ["dtype"] def setup(self, dtype): - N = 10**4 + N = 10**4 * 2 level1 = range(1000) level2 = date_range(start="1/1/2000", periods=N // 1000) @@ -289,12 +289,17 @@ def setup(self, dtype): level2 = range(N // 1000) int_left = MultiIndex.from_product([level1, level2]) + level2 = Series(range(N // 1000), dtype="Int64") + level2[0] = NA + ea_int_left = MultiIndex.from_product([level1, level2]) + level2 = tm.makeStringIndex(N // 1000).values str_left = MultiIndex.from_product([level1, level2]) data = { "datetime": dates_left, "int": int_left, + "ea_int": ea_int_left, "string": str_left, } From ceeca411d81bbe461391190461604b296a7ca4d6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Sep 2022 00:04:00 +0200 Subject: [PATCH 6/8] Fix mypy --- pandas/core/indexes/base.py | 2 ++ pandas/core/indexes/multi.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e3bb7978a6911..959e20e04a6d6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3640,6 +3640,8 @@ def _difference(self, other, sort): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) + + the_diff: MultiIndex | ArrayLike if isinstance(this, ABCMultiIndex): the_diff = this.take(label_diff) else: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index df625a4b1027d..b442dc3b3edf7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3734,9 +3734,13 @@ def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex: _, result_names = self._convert_can_do_setop(other) if len(result) == 0: - return result.remove_unused_levels().set_names(result_names) + # Incompatible return value type (got "Optional[MultiIndex]", + # expected "MultiIndex") [return-value] + return result.remove_unused_levels().set_names( + result_names + ) # type: ignore[return-value] else: - return result.set_names(result_names) + return result.set_names(result_names) # type: ignore[return-value] def _convert_can_do_setop(self, other): result_names = self.names From 82903403d64ed4540628de60188a9ed6fae78279 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Sep 2022 23:48:02 +0200 Subject: [PATCH 7/8] Add whatsnew --- doc/source/whatsnew/v1.6.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 3427b22d27679..1207fa540ca98 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -106,6 +106,7 @@ Performance improvements - Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` for nullable dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`) - Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`) +- Performance improvement in :meth:`MultiIndex.difference` (:issue:`48606`) - Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) From d48d6f34a5c1fa4e40a492888b0d3c8822a4adec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 22 Sep 2022 13:42:51 -0700 Subject: [PATCH 8/8] Fix mypy --- pandas/core/indexes/multi.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 71e72e0358e97..5083454111275 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3731,13 +3731,9 @@ def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex: _, result_names = self._convert_can_do_setop(other) if len(result) == 0: - # Incompatible return value type (got "Optional[MultiIndex]", - # expected "MultiIndex") [return-value] - return result.remove_unused_levels().set_names( - result_names - ) # type: ignore[return-value] + return result.remove_unused_levels().set_names(result_names) else: - return result.set_names(result_names) # type: ignore[return-value] + return result.set_names(result_names) def _convert_can_do_setop(self, other): result_names = self.names