From d0eea008acc3709742ca07c226bc76f19ba50da5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 22 Nov 2022 23:03:33 +0000 Subject: [PATCH 1/4] BUG: MultiIndex.putmask losing ea dtype --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/indexes/base.py | 1 - pandas/core/indexes/multi.py | 29 +++++++++++++++++++++ pandas/tests/indexes/multi/test_indexing.py | 28 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 02ea290995c8d..9814d684c7f77 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -700,6 +700,7 @@ MultiIndex - Bug in :meth:`MultiIndex.union` not sorting when sort=None and index contains missing values (:issue:`49010`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) - Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`) +- Bug in :meth:`MultiIndex.putmask` losing extension array (:issue:`49830`) - Bug in :meth:`MultiIndex.value_counts` returning a :class:`Series` indexed by flat index of tuples instead of a :class:`MultiIndex` (:issue:`49558`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0bc568fb122ed..9c1f5fea7603d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5159,7 +5159,6 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: return Index._with_infer(result, name=name) - @final def putmask(self, mask, value) -> Index: """ Return a new Index of the values set with the mask. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 04a57c1709382..115b4c31008b6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -79,6 +79,7 @@ ) import pandas.core.algorithms as algos +from pandas.core.array_algos.putmask import validate_putmask from pandas.core.arrays import Categorical from pandas.core.arrays.categorical import factorize_from_iterables import pandas.core.common as com @@ -3659,6 +3660,34 @@ def _validate_fill_value(self, item): raise ValueError("Item must have length equal to number of levels.") return item + def putmask(self, mask, value: MultiIndex) -> Index: + mask, noop = validate_putmask(self, mask) + if noop: + return self.copy() + + if len(mask) == len(value): + subset = value[mask].remove_unused_levels() + else: + subset = value.remove_unused_levels() + + new_levels = [] + new_codes = [] + + for i, (value_level, level, level_codes) in enumerate( + zip(subset.levels, self.levels, self.codes) + ): + new_elements = value_level.difference(level) + new_level = level.append(new_elements) + value_codes = new_level.get_indexer_for(subset.get_level_values(i)) + new_code = ensure_int64(level_codes) + new_code[mask] = value_codes + new_levels.append(new_level) + new_codes.append(new_code) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) + def insert(self, loc: int, item) -> MultiIndex: """ Make new MultiIndex inserting new item at location diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 552b3753083fe..4c879c8ff5736 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -162,6 +162,34 @@ def test_putmask_multiindex_other(self): expected = MultiIndex.from_tuples([right[0], right[1], left[2]]) tm.assert_index_equal(result, expected) + def test_putmask_keep_dtype(self, any_numeric_ea_dtype): + # GH#49830 + midx = MultiIndex.from_arrays( + [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]] + ) + midx2 = MultiIndex.from_arrays( + [pd.Series([5, 6, 7], dtype=any_numeric_ea_dtype), [-1, -2, -3]] + ) + result = midx.putmask([True, False, False], midx2) + expected = MultiIndex.from_arrays( + [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]] + ) + tm.assert_index_equal(result, expected) + + def test_putmask_keep_dtype_shorter_value(self, any_numeric_ea_dtype): + # GH#49830 + midx = MultiIndex.from_arrays( + [pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]] + ) + midx2 = MultiIndex.from_arrays( + [pd.Series([5], dtype=any_numeric_ea_dtype), [-1]] + ) + result = midx.putmask([True, False, False], midx2) + expected = MultiIndex.from_arrays( + [pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]] + ) + tm.assert_index_equal(result, expected) + class TestGetIndexer: def test_get_indexer(self): From b73beee00381de87ace7bd9ce5d5df3113881d16 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 00:33:50 +0000 Subject: [PATCH 2/4] Fix typing --- pandas/core/array_algos/putmask.py | 10 ++++++++-- pandas/core/indexes/multi.py | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 17622e78d1b12..3e2c711d12f26 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -3,7 +3,10 @@ """ from __future__ import annotations -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -19,6 +22,9 @@ from pandas.core.arrays import ExtensionArray +if TYPE_CHECKING: + from pandas import MultiIndex + def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None: """ @@ -96,7 +102,7 @@ def putmask_without_repeat( def validate_putmask( - values: ArrayLike, mask: np.ndarray + values: ArrayLike | MultiIndex, mask: np.ndarray ) -> tuple[npt.NDArray[np.bool_], bool]: """ Validate mask and check if this putmask operation is a no-op. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 115b4c31008b6..3841fe08a444c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3660,7 +3660,20 @@ def _validate_fill_value(self, item): raise ValueError("Item must have length equal to number of levels.") return item - def putmask(self, mask, value: MultiIndex) -> Index: + def putmask(self, mask, value: MultiIndex) -> MultiIndex: + """ + Return a new MultiIndex of the values set with the mask. + + Parameters + ---------- + mask : array like + value : MultiIndex + Must either be the same length as self or length one + + Returns + ------- + MultiIndex + """ mask, noop = validate_putmask(self, mask) if noop: return self.copy() From 55f6ad266422c9b0bab310d607d3fd3946f8faac Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 15:20:23 +0000 Subject: [PATCH 3/4] Add asv --- asv_bench/benchmarks/multiindex_object.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 97c710be6d5a1..de55268e0407b 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -379,4 +379,26 @@ def time_isin_large(self, dtype): self.midx.isin(self.values_large) +class Putmask: + def setup(self): + N = 10**5 + level1 = range(1_000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + self.midx = MultiIndex.from_product([level1, level2]) + + level1 = range(1_000, 2_000) + self.midx_values = MultiIndex.from_product([level1, level2]) + + level2 = date_range(start="1/1/2010", periods=N // 1000) + self.midx_values_different = MultiIndex.from_product([level1, level2]) + self.mask = np.array([True, False] * (N // 2)) + + def time_putmask(self): + self.midx.putmask(self.mask, self.midx_values) + + def time_putmask_all_different(self): + self.midx.putmask(self.mask, self.midx_values_different) + + from .pandas_vb_common import setup # noqa: F401 isort:skip From 6bf213d682cbb7cd7a44f55cdcd0776d6fdd2974 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Nov 2022 22:06:50 +0000 Subject: [PATCH 4/4] Simplify and add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/indexes/multi.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9814d684c7f77..14cd13fb362d6 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -586,6 +586,7 @@ Performance improvements - Performance improvement in :class:`MultiIndex` set operations with sort=None (:issue:`49010`) - Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`, :issue:`49577`) +- Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`) - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`) - Performance improvement in :meth:`Series.fillna` for pyarrow-backed dtypes (:issue:`49722`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3841fe08a444c..eba1993afa26a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3689,8 +3689,7 @@ def putmask(self, mask, value: MultiIndex) -> MultiIndex: for i, (value_level, level, level_codes) in enumerate( zip(subset.levels, self.levels, self.codes) ): - new_elements = value_level.difference(level) - new_level = level.append(new_elements) + new_level = level.union(value_level, sort=False) value_codes = new_level.get_indexer_for(subset.get_level_values(i)) new_code = ensure_int64(level_codes) new_code[mask] = value_codes