diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 96fe90ce08c5f..8e0bffd7b8ece 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -693,20 +693,24 @@ def time_frame_sort_values(self, ascending): self.df.sort_values(by="A", ascending=ascending) -class SortIndexByColumns: +class SortMultiKey: def setup(self): N = 10000 K = 10 - self.df = DataFrame( + self.df_by_columns = DataFrame( { "key1": tm.makeStringIndex(N).values.repeat(K), "key2": tm.makeStringIndex(N).values.repeat(K), "value": np.random.randn(N * K), } ) + self.df_by_index = self.df_by_columns.set_index(["key1", "key2"]) + + def time_sort_values(self): + self.df_by_columns.sort_values(by=["key1", "key2"]) - def time_frame_sort_values_by_columns(self): - self.df.sort_values(by=["key1", "key2"]) + def time_sort_index(self): + self.df_by_index.sort_index() class Quantile: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e49436c4ed549..621c9159a5fe8 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -157,6 +157,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae0ab66dd5f3e..bcb1392b5e7a7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2208,17 +2208,9 @@ def append(self, other): def argsort( self, *args, na_position: str = "last", **kwargs ) -> npt.NDArray[np.intp]: - if len(args) == 0 and len(kwargs) == 0: - # lexsort is significantly faster than self._values.argsort() - target = self._sort_levels_monotonic(raise_if_incomparable=True) - return lexsort_indexer( - # error: Argument 1 to "lexsort_indexer" has incompatible type - # "List[Categorical]"; expected "Union[List[Union[ExtensionArray, - # ndarray[Any, Any]]], List[Series]]" - target._get_codes_for_sorting(), # type: ignore[arg-type] - na_position=na_position, - ) - return self._values.argsort(*args, **kwargs) + target = self._sort_levels_monotonic(raise_if_incomparable=True) + keys = [lev.codes for lev in target._get_codes_for_sorting()] + return lexsort_indexer(keys, na_position=na_position, codes_given=True) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats: int, axis=None) -> MultiIndex: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index bc6e29c3c7fbf..485644339414f 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -99,8 +99,9 @@ def get_indexer_indexer( na_position=na_position, ) elif isinstance(target, ABCMultiIndex): + codes = [lev.codes for lev in target._get_codes_for_sorting()] indexer = lexsort_indexer( - target.codes, orders=ascending, na_position=na_position, codes_given=True + codes, orders=ascending, na_position=na_position, codes_given=True ) else: # Check monotonic-ness before sort an index (GH 11080) @@ -298,22 +299,8 @@ def decons_obs_group_ids( return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] -def indexer_from_factorized( - labels, shape: Shape, compress: bool = True -) -> npt.NDArray[np.intp]: - ids = get_group_index(labels, shape, sort=True, xnull=False) - - if not compress: - ngroups = (ids.size and ids.max()) + 1 - else: - ids, obs = compress_group_index(ids, sort=True) - ngroups = len(obs) - - return get_group_index_sorter(ids, ngroups) - - def lexsort_indexer( - keys: list[ArrayLike] | list[Series], + keys: Sequence[ArrayLike | Index | Series], orders=None, na_position: str = "last", key: Callable | None = None, @@ -324,9 +311,9 @@ def lexsort_indexer( Parameters ---------- - keys : list[ArrayLike] | list[Series] - Sequence of ndarrays to be sorted by the indexer - list[Series] is only if key is not None. + keys : Sequence[ArrayLike | Index | Series] + Sequence of arrays to be sorted by the indexer + Sequence[Series] is only if key is not None. orders : bool or list of booleans, optional Determines the sorting order for each element in keys. If a list, it must be the same length as keys. This determines whether the @@ -346,83 +333,38 @@ def lexsort_indexer( """ from pandas.core.arrays import Categorical - labels = [] - shape = [] + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {na_position}") + if isinstance(orders, bool): orders = [orders] * len(keys) elif orders is None: orders = [True] * len(keys) - # error: Incompatible types in assignment (expression has type - # "List[Union[ExtensionArray, ndarray[Any, Any], Index, Series]]", variable - # has type "Union[List[Union[ExtensionArray, ndarray[Any, Any]]], List[Series]]") - keys = [ensure_key_mapped(k, key) for k in keys] # type: ignore[assignment] + labels = [] for k, order in zip(keys, orders): - if na_position not in ["last", "first"]: - raise ValueError(f"invalid na_position: {na_position}") - + k = ensure_key_mapped(k, key) if codes_given: - mask = k == -1 - codes = k.copy() - n = len(codes) - mask_n = n - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray, - # ndarray[Any, Any]]" has no attribute "any" - if mask.any(): # type: ignore[union-attr] - n -= 1 - + codes = cast(np.ndarray, k) + n = codes.max() + 1 if len(codes) else 0 else: cat = Categorical(k, ordered=True) + codes = cat.codes n = len(cat.categories) - codes = cat.codes.copy() - mask = cat.codes == -1 - if mask.any(): - mask_n = n + 1 - else: - mask_n = n - - if order: # ascending - if na_position == "last": - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, n, codes) # type: ignore[arg-type] - elif na_position == "first": - # error: Incompatible types in assignment (expression has type - # "Union[Any, int, ndarray[Any, dtype[signedinteger[Any]]]]", - # variable has type "Union[Series, ExtensionArray, ndarray[Any, Any]]") - # error: Unsupported operand types for + ("ExtensionArray" and "int") - codes += 1 # type: ignore[operator,assignment] - else: # not order means descending - if na_position == "last": - # error: Unsupported operand types for - ("int" and "ExtensionArray") - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where( - mask, n, n - codes - 1 # type: ignore[operator,arg-type] - ) - elif na_position == "first": - # error: Unsupported operand types for - ("int" and "ExtensionArray") - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, 0, n - codes) # type: ignore[operator,arg-type] - - shape.append(mask_n) + + mask = codes == -1 + + if na_position == "last" and mask.any(): + codes = np.where(mask, n, codes) + + # not order means descending + if not order: + codes = np.where(mask, codes, n - codes - 1) + labels.append(codes) - return indexer_from_factorized(labels, tuple(shape)) + return np.lexsort(labels[::-1]) def nargsort(