From 54ae9c52a4a0315b0cc8e846e34ae4fbea3025b2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:16:58 -0700 Subject: [PATCH 1/2] PERF: Only clear cached .levels when setting MultiIndex.names --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/multi.py | 24 +++++++++---------- .../multiindex/test_chaining_and_caching.py | 17 +++++++++++++ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bdeb9c48990a2..78b7651dd250d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -508,6 +508,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) - Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`) +- Performance improvement in :class:`MultiIndex` when setting :attr:`MultiIndex.names` doesn't invalidate all cached operations (:issue:`?`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c3d4ad721c830..166ddc9524a9e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,7 +799,7 @@ def dtypes(self) -> Series: """ from pandas import Series - names = com.fill_missing_names([level.name for level in self.levels]) + names = com.fill_missing_names(self.names) return Series([level.dtype for level in self.levels], index=Index(names)) def __len__(self) -> int: @@ -1572,7 +1572,7 @@ def _format_multi( def _get_names(self) -> FrozenList: return FrozenList(self._names) - def _set_names(self, names, *, level=None, validate: bool = True) -> None: + def _set_names(self, names, *, level=None) -> None: """ Set new names on index. Each name has to be a hashable type. @@ -1583,8 +1583,6 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: level : int, level name, or sequence of int/level names (default None) If the index is a MultiIndex (hierarchical), level(s) to set (None for all levels). Otherwise level must be None - validate : bool, default True - validate that the names match level lengths Raises ------ @@ -1603,13 +1601,12 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: raise ValueError("Names should be list-like for a MultiIndex") names = list(names) - if validate: - if level is not None and len(names) != len(level): - raise ValueError("Length of names must match length of level.") - if level is None and len(names) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." + ) if level is None: level = range(self.nlevels) @@ -1627,8 +1624,9 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: ) self._names[lev] = name - # If .levels has been accessed, the names in our cache will be stale. - self._reset_cache() + # If .levels has been accessed, the .name of each level in our cache + # will be stale. + self._reset_cache("levels") names = property( fset=_set_names, diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 43aec12055cd2..c7ed21a2cc001 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, MultiIndex, + RangeIndex, Series, ) import pandas._testing as tm @@ -68,3 +69,19 @@ def test_indexer_caching(monkeypatch): s[s == 0] = 1 expected = Series(np.ones(size_cutoff), index=index) tm.assert_series_equal(s, expected) + + +def test_set_names_only_clears_level_cache(): + mi = MultiIndex.from_arrays([range(4), range(4)], names=["a", "b"]) + mi.dtypes + mi.is_monotonic_increasing + mi._engine + mi.levels + old_cache_keys = sorted(mi._cache.keys()) + assert old_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing", "levels"] + mi.names = ["A", "B"] + new_cache_keys = sorted(mi._cache.keys()) + assert new_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing"] + new_levels = mi.levels + tm.assert_index_equal(new_levels[0], RangeIndex(4, name="A")) + tm.assert_index_equal(new_levels[1], RangeIndex(4, name="B")) From 931cc7653f66cc27588d14b992a5c7e742057343 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 21 Aug 2024 12:25:29 -0700 Subject: [PATCH 2/2] whatsnew number --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 78b7651dd250d..24e1f641c7925 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -500,6 +500,7 @@ Performance improvements - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`) +- Performance improvement in :class:`MultiIndex` when setting :attr:`MultiIndex.names` doesn't invalidate all cached operations (:issue:`59578`) - Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`) - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`) - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`) @@ -508,7 +509,6 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) - Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`) -- Performance improvement in :class:`MultiIndex` when setting :attr:`MultiIndex.names` doesn't invalidate all cached operations (:issue:`?`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)