From b8644cb82005a5722ffd7e96356e6ea9d43cd045 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sun, 6 Mar 2022 00:16:32 +0800
Subject: [PATCH 01/39] cannot construct Index of empty tuples

---
 pandas/core/indexes/base.py                          |  3 +--
 pandas/tests/indexes/base_class/test_constructors.py | 10 ++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 971ee38005673..86e448979ec0b 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -531,8 +531,7 @@ def __new__(
         elif hasattr(data, "__array__"):
             return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
         else:
-
-            if tupleize_cols and is_list_like(data):
+            if tupleize_cols and is_list_like(data) and all(data):
                 # GH21470: convert iterable to list before determining if empty
                 if is_iterator(data):
                     data = list(data)
diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py
index bc894579340ab..c40948fb12b41 100644
--- a/pandas/tests/indexes/base_class/test_constructors.py
+++ b/pandas/tests/indexes/base_class/test_constructors.py
@@ -40,3 +40,13 @@ def test_constructor_cast(self):
         msg = "could not convert string to float"
         with pytest.raises(ValueError, match=msg):
             Index(["a", "b", "c"], dtype=float)
+
+    def test_construct_empty_tuples(self):
+        # GH #45608
+        result = Index([()])
+        expected = Index([()], dtype="object")
+        tm.assert_index_equal(result, expected)
+
+        result = Index([(), None])
+        expected = Index([(), None], dtype="object")
+        tm.assert_index_equal(result, expected)

From 83cd66aec8f19033d0987bc414db95967dae2e98 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sun, 6 Mar 2022 15:56:27 +0800
Subject: [PATCH 02/39] test case failed

---
 pandas/core/indexes/base.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 86e448979ec0b..2383b78b672ed 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -531,7 +531,7 @@ def __new__(
         elif hasattr(data, "__array__"):
             return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
         else:
-            if tupleize_cols and is_list_like(data) and all(data):
+            if tupleize_cols and is_list_like(data):
                 # GH21470: convert iterable to list before determining if empty
                 if is_iterator(data):
                     data = list(data)
@@ -541,9 +541,10 @@ def __new__(
                     # 10697
                     from pandas.core.indexes.multi import MultiIndex

-                    return MultiIndex.from_tuples(
-                        data, names=name or kwargs.get("names")
-                    )
+                    if all(data):
+                        return MultiIndex.from_tuples(
+                            data, names=name or kwargs.get("names")
+                        )
                 # other iterable of some kind
                 subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
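For context, the test added in the first commit pins down the behavior reported in GH#45608: constructing an Index from empty tuples used to fail inside MultiIndex.from_tuples. A minimal sketch of what the test asserts, assuming a pandas build that includes this fix:

import pandas as pd
import pandas._testing as tm

# An input of empty tuples should fall back to a flat object-dtype
# Index instead of attempting (and failing) to build a MultiIndex.
tm.assert_index_equal(pd.Index([()]), pd.Index([()], dtype="object"))

# Mixing empty tuples with None should behave the same way.
tm.assert_index_equal(pd.Index([(), None]), pd.Index([(), None], dtype="object"))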
From ae1414f205299b43963d27fe827e733e287426dc Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Tue, 8 Mar 2022 20:56:46 +0800
Subject: [PATCH 03/39] make change

---
 pandas/core/indexes/base.py  | 13 +++++--------
 pandas/core/indexes/multi.py |  6 ++++++
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2383b78b672ed..d717e5cfb1083 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -531,6 +531,7 @@ def __new__(
         elif hasattr(data, "__array__"):
             return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
         else:
+
             if tupleize_cols and is_list_like(data):
                 # GH21470: convert iterable to list before determining if empty
                 if is_iterator(data):
                     data = list(data)
@@ -541,9 +542,8 @@ def __new__(
                     # 10697
                     from pandas.core.indexes.multi import MultiIndex

-                    if all(data):
-                        return MultiIndex.from_tuples(
-                            data, names=name or kwargs.get("names")
-                        )
+                    return MultiIndex.from_tuples(
+                        data, names=name or kwargs.get("names")
+                    )
                 # other iterable of some kind
                 subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
@@ -3932,10 +3932,7 @@ def _get_indexer(
         else:
             tgt_values = target._get_engine_target()

-        # error: Argument 1 to "get_indexer" of "IndexEngine" has incompatible
-        # type "Union[ExtensionArray, ndarray[Any, Any]]"; expected
-        # "ndarray[Any, Any]"
-        indexer = self._engine.get_indexer(tgt_values)  # type: ignore[arg-type]
+        indexer = self._engine.get_indexer(tgt_values)

         return ensure_platform_int(indexer)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index f40857059a794..452ebb67bfba5 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -541,6 +541,12 @@ def from_tuples(
                     (2, 'blue')],
                    names=['number', 'color'])
         """
+        tuples = list(tuples)
+        if not all(tuples):
+            _dtype_obj = np.dtype("object")
+            subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
+            return Index(subarr, dtype=_dtype_obj)
+
         if not is_list_like(tuples):
             raise TypeError("Input must be a list / sequence of tuple-likes.")
         elif is_iterator(tuples):

From 2d15d491d6dc77fb8d756533edc301b18c78aae8 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Tue, 8 Mar 2022 22:58:12 +0800
Subject: [PATCH 04/39] change tuple

---
 pandas/core/indexes/multi.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 452ebb67bfba5..c473447dd3b5e 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -541,12 +541,6 @@ def from_tuples(
                     (2, 'blue')],
                    names=['number', 'color'])
         """
-        tuples = list(tuples)
-        if not all(tuples):
-            _dtype_obj = np.dtype("object")
-            subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
-            return Index(subarr, dtype=_dtype_obj)
-
         if not is_list_like(tuples):
             raise TypeError("Input must be a list / sequence of tuple-likes.")
         elif is_iterator(tuples):
@@ -569,6 +563,13 @@ def from_tuples(
             arrs = zip(*tuples)
             arrays = cast(List[Sequence[Hashable]], arrs)

+        tuples = list(tuples)
+
+        if not all(tuples):
+            _dtype_obj = np.dtype("object")
+            subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
+            return Index(subarr, dtype=_dtype_obj)
+
         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

     @classmethod

From 7ef7c86cbf0872dd3cd386e0b654636cb507c2de Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Wed, 9 Mar 2022 21:52:03 +0800
Subject: [PATCH 05/39] multi index

---
 pandas/core/indexes/multi.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 590f0532b90fb..688ceb15fcc76 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -564,11 +564,15 @@ def from_tuples(
             arrays = cast(List[Sequence[Hashable]], arrs)

         tuples = list(tuples)
+        if all(isinstance(item, tuple) for item in tuples):
+            tuples_bool = [len(tuple_obj) == 0 for tuple_obj in tuples]

-        if not all(tuples):
-            _dtype_obj = np.dtype("object")
-            subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
-            return Index(subarr, dtype=_dtype_obj)
+            if np.array(tuples_bool).all():
+
+                _dtype_obj = np.dtype("object")
+                subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
+
+                return Index(subarr, dtype=_dtype_obj)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)
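The guards in these commits (`all(data)`, `all(tuples)`) lean on Python truthiness: an empty tuple is falsy, so `all(...)` over a sequence of tuples is False exactly when some element is empty or None. A quick standalone illustration:

# all() treats an empty tuple (and None) as falsy, which is what the
# all(data) / all(tuples) guards in these commits rely on.
print(all([(1, "a"), (2, "b")]))  # True  -> proceed with MultiIndex
print(all([(), ()]))              # False -> fall back to a flat Index
print(all([(), None]))            # False -> None is falsy too
print(all([]))                    # True  -> vacuously true edge case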
From ccd54798a5d24bdb12499fe190cbd62b6ba6e613 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Thu, 10 Mar 2022 00:31:28 +0800
Subject: [PATCH 06/39] multi index

---
 pandas/core/indexes/multi.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 688ceb15fcc76..299aa9026766a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -563,15 +563,17 @@ def from_tuples(
             arrs = zip(*tuples)
             arrays = cast(List[Sequence[Hashable]], arrs)

-        tuples = list(tuples)
-        if all(isinstance(item, tuple) for item in tuples):
+        tuples_list = list(tuples)
+
+        if len(tuples_list) != 0 and all(
+            isinstance(item, tuple) for item in tuples_list
+        ):
             tuples_bool = [len(tuple_obj) == 0 for tuple_obj in tuples]

             if np.array(tuples_bool).all():
-
                 _dtype_obj = np.dtype("object")
                 subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
-
                 return Index(subarr, dtype=_dtype_obj)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

From 6915cc107caca0a24c5eb5ee6f75d617e784aec1 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Thu, 10 Mar 2022 21:26:36 +0800
Subject: [PATCH 07/39] change to index

---
 pandas/core/indexes/base.py  |  4 ++++
 pandas/core/indexes/multi.py | 13 -------------
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index d717e5cfb1083..447d59fd118bb 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -542,9 +542,13 @@ def __new__(
                     # 10697
                     from pandas.core.indexes.multi import MultiIndex

+                    if not all(data):
+                        subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
+                        return Index(subarr, dtype=dtype)
                     return MultiIndex.from_tuples(
                         data, names=name or kwargs.get("names")
                     )
+
                 # other iterable of some kind
                 subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 299aa9026766a..cdde510927081 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -563,19 +563,6 @@ def from_tuples(
             arrs = zip(*tuples)
             arrays = cast(List[Sequence[Hashable]], arrs)

-        tuples_list = list(tuples)
-
-        if len(tuples_list) != 0 and all(
-            isinstance(item, tuple) for item in tuples_list
-        ):
-            tuples_bool = [len(tuple_obj) == 0 for tuple_obj in tuples]
-
-            if np.array(tuples_bool).all():
-
-                _dtype_obj = np.dtype("object")
-                subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
-                return Index(subarr, dtype=_dtype_obj)
-
         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

     @classmethod

From 44adae2aa28126c604b3a9b040ad0f07b1ebac1f Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 17:10:01 +0800
Subject: [PATCH 08/39] from tuples

---
 pandas/core/indexes/base.py  | 3 ---
 pandas/core/indexes/multi.py | 6 ++++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 1ef7a10e1e161..d8739425f69bb 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -542,9 +542,6 @@ def __new__(
                     # 10697
                     from pandas.core.indexes.multi import MultiIndex

-                    if not all(data):
-                        subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)
-                        return Index(subarr, dtype=dtype)
                     return MultiIndex.from_tuples(
                         data, names=name or kwargs.get("names")
                     )
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 272c4ac596755..7531eef682d25 100644
--- a/pandas/core/indexes/multi.py
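Both versions of the fallback lean on the internal helper `com.asarray_tuplesafe`. A hedged sketch of what it produces (this is private pandas API, shown only for illustration):

import numpy as np
from pandas.core.common import asarray_tuplesafe  # internal API

# Builds a 1-D object ndarray in which each tuple stays a single
# element, rather than letting NumPy broadcast tuples into rows.
arr = asarray_tuplesafe([(), ()], dtype=np.dtype("object"))
print(arr.shape)  # (2,)
print(arr[0])     # ()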
+++ b/pandas/core/indexes/multi.py
@@ -563,6 +563,12 @@ def from_tuples(
             arrs = zip(*tuples)
             arrays = cast(List[Sequence[Hashable]], arrs)

+        if not all(tuples):
+            _dtype_obj = np.dtype("object")
+            subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
+            dtype = subarr.dtype
+            return Index(subarr, dtype=dtype)
+
         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

     @classmethod

From 1283e970d2ab790e078216ba4cc97329698bbad9 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 17:11:14 +0800
Subject: [PATCH 09/39] merge

---
 pandas/core/indexes/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index d8739425f69bb..c673848bc022a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -545,7 +545,6 @@ def __new__(
                     return MultiIndex.from_tuples(
                         data, names=name or kwargs.get("names")
                     )
-
                 # other iterable of some kind
                 subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj)

From 79bd8eac12b4ebea80b384b2a8a40ffe692ef317 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 18:26:21 +0800
Subject: [PATCH 10/39] ensure function return multiindex type

---
 pandas/core/indexes/multi.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 7531eef682d25..4980271347b1e 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -565,9 +565,7 @@ def from_tuples(
         if not all(tuples):
             _dtype_obj = np.dtype("object")
-            subarr = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
-            dtype = subarr.dtype
-            return Index(subarr, dtype=dtype)
+            arrays = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

From 4fdae77874602e514d109d07de82551e0ed16f62 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 18:54:58 +0800
Subject: [PATCH 11/39] Incompatible types in assignment

---
 pandas/core/indexes/multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 4980271347b1e..2da885701b807 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -565,7 +565,7 @@ def from_tuples(
         if not all(tuples):
             _dtype_obj = np.dtype("object")
-            arrays = com.asarray_tuplesafe(tuples, dtype=_dtype_obj)
+            arrays = com.asarray_tuplesafe(np.array(tuples), dtype=_dtype_obj)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

From 1ee00a1b3c150310599a8a97964fe2ef83ea24ac Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 20:08:24 +0800
Subject: [PATCH 12/39] mypy

---
 pandas/core/indexes/multi.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 2da885701b807..6403e08825606 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -564,8 +564,7 @@ def from_tuples(
             arrays = cast(List[Sequence[Hashable]], arrs)

         if not all(tuples):
-            _dtype_obj = np.dtype("object")
-            arrays = com.asarray_tuplesafe(np.array(tuples), dtype=_dtype_obj)
+            return cls.from_arrays(tuples, sortorder=sortorder, names=names)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)
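The patch-12 fallback forwards the raw tuples to `from_arrays`, which normally expects one array-like per level. A short reminder of that contract, using only public API:

import pandas as pd

# from_arrays takes one array-like per level; two arrays -> two levels.
mi = pd.MultiIndex.from_arrays([[1, 1, 2], ["red", "blue", "red"]])
print(mi.nlevels)  # 2
print(mi[0])       # (1, 'red')

Passing the tuples themselves (e.g. [(), ()]) does not fit that contract, which is why the following commits keep adjusting where the empty-tuple case is recognised.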
From 4dcc06d02313eb79cd621d886f86636582023651 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 20:59:16 +0800
Subject: [PATCH 13/39] check tuple

---
 pandas/core/indexes/multi.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 6403e08825606..90701b679d6eb 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -563,8 +563,9 @@ def from_tuples(
             arrs = zip(*tuples)
             arrays = cast(List[Sequence[Hashable]], arrs)

-        if not all(tuples):
-            return cls.from_arrays(tuples, sortorder=sortorder, names=names)
+        if tuples and all(isinstance(e, tuple) for e in tuples):
+            if not all(tuples):
+                return cls.from_arrays(tuples, sortorder=sortorder, names=names)

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

From 0323e0ee880d743c09da92f34db9045447afa318 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 12 Mar 2022 22:13:37 +0800
Subject: [PATCH 14/39] condition

---
 pandas/core/indexes/multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 90701b679d6eb..f998ff73d0f22 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -563,7 +563,7 @@ def from_tuples(
             arrs = zip(*tuples)
             arrays = cast(List[Sequence[Hashable]], arrs)

-        if tuples and all(isinstance(e, tuple) for e in tuples):
+        if all(isinstance(e, tuple) for e in tuples):
             if not all(tuples):
                 return cls.from_arrays(tuples, sortorder=sortorder, names=names)

From b8f25363cfed237e9f86cadb837a8b9436b747bd Mon Sep 17 00:00:00 2001
From: Khor Chean Wei
Date: Tue, 15 Mar 2022 22:24:51 +0800
Subject: [PATCH 15/39] Multi index

---
 pandas/core/indexes/multi.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index f998ff73d0f22..1aa8f6aa19cde 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -342,9 +342,7 @@ def __new__(
         if verify_integrity:
             new_codes = result._verify_integrity()
             result._codes = new_codes
-
        result._reset_identity()
-
        return result

    def _validate_codes(self, level: list, code: list):
@@ -487,6 +485,13 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex
                 raise ValueError("all arrays must be same length")

         codes, levels = factorize_from_iterables(arrays)
+
+        if all(isinstance(e, tuple) for e in arrays):
+            codes = [np.array([i for i in range(len(arrays))])]
+            _dtype_obj = np.dtype("object")
+            subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj)
+            levels = [Index(subarr)]
+
         if names is lib.no_default:
             names = [getattr(arr, "name", None) for arr in arrays]
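Patch 15 special-cases `from_arrays` when it receives raw tuples, hand-building one level plus identity codes. A standalone sketch of the shape it constructs (illustrative names only; note that MultiIndex levels are normally also required to be unique):

import numpy as np
from pandas.core.common import asarray_tuplesafe  # internal API

arrays = [(), ()]
# Identity codes: the i-th code points at the i-th stored tuple.
codes = [np.arange(len(arrays))]
# A single level holding the tuples themselves as an object ndarray.
levels = [asarray_tuplesafe(arrays, dtype=np.dtype("object"))]
print(codes[0])   # [0 1]
print(levels[0])  # [() ()]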
pandas._typing import ( - AnyArrayLike, - DtypeObj, - F, + Dtype, + NpDtype, + PositionalIndexer, Scalar, - Shape, + ScalarIndexer, + SequenceIndexer, + TakeIndexer, npt, ) -from pandas.compat.numpy import function as nv -from pandas.errors import ( - InvalidIndexError, - PerformanceWarning, - UnsortedIndexError, -) -from pandas.util._decorators import ( - Appender, - cache_readonly, - deprecate_nonkeyword_arguments, - doc, +from pandas.compat import ( + pa_version_under1p01, + pa_version_under2p0, + pa_version_under3p0, + pa_version_under4p0, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._decorators import doc -from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( - ensure_int64, - ensure_platform_int, - is_categorical_dtype, - is_hashable, + is_array_like, + is_bool_dtype, + is_dtype_equal, is_integer, - is_iterator, - is_list_like, + is_integer_dtype, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCDatetimeIndex, - ABCTimedeltaIndex, -) -from pandas.core.dtypes.missing import ( - array_equivalent, - isna, -) - -import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical -from pandas.core.arrays.categorical import factorize_from_iterables -import pandas.core.common as com -import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import ( - Index, - _index_shared_docs, - ensure_index, - get_unanimous_names, -) -from pandas.core.indexes.frozen import FrozenList -from pandas.core.indexes.numeric import Int64Index -from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ( - get_group_index, - indexer_from_factorized, - lexsort_indexer, +from pandas.core.dtypes.missing import isna + +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._mixins import ArrowExtensionArray +from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.numeric import NumericDtype +from pandas.core.arrays.string_ import ( + BaseStringArray, + StringDtype, ) - -from pandas.io.formats.printing import pprint_thing - -if TYPE_CHECKING: - from pandas import ( - CategoricalIndex, - DataFrame, - Series, - ) - -_index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update( - {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"} +from pandas.core.indexers import ( + check_array_indexer, + unpack_tuple_and_ellipses, + validate_indices, ) +from pandas.core.strings.object_array import ObjectStringArrayMixin +if not pa_version_under1p01: + import pyarrow as pa + import pyarrow.compute as pc -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): - """ - This class manages a MultiIndex by mapping label combinations to positive - integers. - """ - - _base = libindex.UInt64Engine - - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one uint64 (each), in a strictly - monotonic way (i.e. respecting the lexicographic order of integer - combinations): see BaseMultiIndexCodesEngine documentation. - - Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) - - Returns - ------- - scalar or 1-dimensional array, of dtype uint64 - Integer(s) representing one combination (each). 
- """ - # Shift the representation of each level by the pre-calculated number - # of bits: - codes <<= self.offsets - - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer: - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) - - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) - + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): - """ - This class manages those (extreme) cases in which the number of possible - label combinations overflows the 64 bits integers, and uses an ObjectEngine - containing Python integers. - """ - _base = libindex.ObjectEngine - - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one Python integer (each), in a - strictly monotonic way (i.e. respecting the lexicographic order of - integer combinations): see BaseMultiIndexCodesEngine documentation. +if TYPE_CHECKING: + from pandas import Series - Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +ArrowStringScalarOrNAT = Union[str, libmissing.NAType] - Returns - ------- - int, or 1-dimensional array of dtype object - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits. Since this can overflow uint64, first make sure we are - # working with Python integers: - codes = codes.astype("object") << self.offsets - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer (per row): - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) +def _chk_pyarrow_available() -> None: + if pa_version_under1p01: + msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + raise ImportError(msg) - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) +# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from +# ObjectStringArrayMixin because we want to have the object-dtype based methods as +# fallback for the ones that pyarrow doesn't yet support -def names_compat(meth: F) -> F: - """ - A decorator to allow either `name` or `names` keyword but not both. - This makes it easier to share code with base class. +class ArrowStringArray( + OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin +): """ + Extension array for string data in a ``pyarrow.ChunkedArray``. - @wraps(meth) - def new_meth(self_or_cls, *args, **kwargs): - if "name" in kwargs and "names" in kwargs: - raise TypeError("Can only provide one of `names` and `name`") - elif "name" in kwargs: - kwargs["names"] = kwargs.pop("name") - - return meth(self_or_cls, *args, **kwargs) + .. versionadded:: 1.2.0 - return cast(F, new_meth) + .. warning:: - -class MultiIndex(Index): - """ - A multi-level, or hierarchical, index object for pandas objects. + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. Parameters ---------- - levels : sequence of arrays - The unique labels for each level. - codes : sequence of arrays - Integers for each level designating which label at each location. 
- sortorder : optional int - Level of sortedness (must be lexicographically sorted by that - level). - names : optional sequence of objects - Names for each of the index levels. (name is accepted for compat). - copy : bool, default False - Copy the meta-data. - verify_integrity : bool, default True - Check that the levels/codes are consistent and valid. + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. Attributes ---------- - names - levels - codes - nlevels - levshape + None Methods ------- - from_arrays - from_tuples - from_product - from_frame - set_levels - set_codes - to_frame - to_flat_index - sortlevel - droplevel - swaplevel - reorder_levels - remove_unused_levels - get_locs + None See Also -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_product : Create a MultiIndex from the cartesian product - of iterables. - MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - Index : The base pandas Index type. + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. Notes ----- - See the `user guide - `__ - for more. + ArrowStringArray returns a BooleanArray for comparison methods. Examples -------- - A new ``MultiIndex`` is typically constructed using one of the helper - methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` - and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): - - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - names=['number', 'color']) - - See further examples for how to construct a MultiIndex in the doc strings - of the mentioned helper methods. 
+ >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: string """ - _hidden_attrs = Index._hidden_attrs | frozenset() - - # initialize to zero-length tuples to make everything work - _typ = "multiindex" - _names = FrozenList() - _levels = FrozenList() - _codes = FrozenList() - _comparables = ["names"] - - sortorder: int | None - - # -------------------------------------------------------------------- - # Constructors - - def __new__( - cls, - levels=None, - codes=None, - sortorder=None, - names=None, - dtype=None, - copy=False, - name=None, - verify_integrity: bool = True, - ): - - # compat with Index - if name is not None: - names = name - if levels is None or codes is None: - raise TypeError("Must pass both levels and codes") - if len(levels) != len(codes): - raise ValueError("Length of levels and codes must be the same.") - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") - - result = object.__new__(cls) - result._cache = {} - - # we've already validated levels and codes, so shortcut here - result._set_levels(levels, copy=copy, validate=False) - result._set_codes(codes, copy=copy, validate=False) - - # Incompatible types in assignment (expression has type "List[None]", - # variable has type "FrozenList") [assignment] - result._names = [None] * len(levels) # type: ignore[assignment] - if names is not None: - # handles name validation - result._set_names(names) - - if sortorder is not None: - result.sortorder = int(sortorder) + def __init__(self, values) -> None: + self._dtype = StringDtype(storage="pyarrow") + if isinstance(values, pa.Array): + self._data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._data = values else: - result.sortorder = sortorder - - if verify_integrity: - new_codes = result._verify_integrity() - result._codes = new_codes - result._reset_identity() - return result - - def _validate_codes(self, level: list, code: list): - """ - Reassign code values as -1 if their corresponding levels are NaN. - - Parameters - ---------- - code : list - Code to reassign. - level : list - Level to check for missing values (NaN, NaT, None). - - Returns - ------- - new code where code value = -1 if it corresponds - to a level with missing values (NaN, NaT, None). - """ - null_mask = isna(level) - if np.any(null_mask): - # Incompatible types in assignment (expression has type - # "ndarray[Any, dtype[Any]]", variable has type "List[Any]") - code = np.where(null_mask[code], -1, code) # type: ignore[assignment] - return code - - def _verify_integrity(self, codes: list | None = None, levels: list | None = None): - """ - Parameters - ---------- - codes : optional list - Codes to check for validity. Defaults to current codes. - levels : optional list - Levels to check for validity. Defaults to current levels. - - Raises - ------ - ValueError - If length of levels and codes don't match, if the codes for any - level would exceed level bounds, or there are any duplicate levels. - - Returns - ------- - new codes where code value = -1 if it corresponds to a - NaN level. - """ - # NOTE: Currently does not check, among other things, that cached - # nlevels matches nor that sortorder matches actually sortorder. 
- codes = codes or self.codes - levels = levels or self.levels + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - if len(levels) != len(codes): + if not pa.types.is_string(self._data.type): raise ValueError( - "Length of levels and codes must match. NOTE: " - "this index is in an inconsistent state." + "ArrowStringArray requires a PyArrow (chunked) array of string type" ) - codes_length = len(codes[0]) - for i, (level, level_codes) in enumerate(zip(levels, codes)): - if len(level_codes) != codes_length: - raise ValueError( - f"Unequal code lengths: {[len(code_) for code_ in codes]}" - ) - if len(level_codes) and level_codes.max() >= len(level): - raise ValueError( - f"On level {i}, code max ({level_codes.max()}) >= length of " - f"level ({len(level)}). NOTE: this index is in an " - "inconsistent state" - ) - if len(level_codes) and level_codes.min() < -1: - raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1") - if not level.is_unique: - raise ValueError( - f"Level values must be unique: {list(level)} on level {i}" - ) - if self.sortorder is not None: - if self.sortorder > _lexsort_depth(self.codes, self.nlevels): - raise ValueError( - "Value for sortorder must be inferior or equal to actual " - f"lexsort_depth: sortorder {self.sortorder} " - f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}" - ) - - codes = [ - self._validate_codes(level, code) for level, code in zip(levels, codes) - ] - new_codes = FrozenList(codes) - return new_codes @classmethod - def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex: - """ - Convert arrays to MultiIndex. + def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + from pandas.core.arrays.masked import BaseMaskedArray - Parameters - ---------- - arrays : list / sequence of array-likes - Each array-like gives one level's value for each data point. - len(arrays) is the number of levels. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of str, optional - Names for the levels in the index. + _chk_pyarrow_available() - Returns - ------- - MultiIndex + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype in ensure_string_array and + # numerical issues with Float32Dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + return cls(pa.array(result, mask=na_values, type=pa.string())) - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - names=['number', 'color']) - """ - error_msg = "Input must be a list / sequence of array-likes." 
- if not is_list_like(arrays): - raise TypeError(error_msg) - elif is_iterator(arrays): - arrays = list(arrays) - - # Check if elements of array are list-like - for array in arrays: - if not is_list_like(array): - raise TypeError(error_msg) - - # Check if lengths of all arrays are equal or not, - # raise ValueError, if not - for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError("all arrays must be same length") - - codes, levels = factorize_from_iterables(arrays) - - if all(isinstance(e, tuple) for e in arrays): - codes = [np.array([i for i in range(len(arrays))])] - _dtype_obj = np.dtype("object") - subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) - levels = [Index(subarr)] - - if names is lib.no_default: - names = [getattr(arr, "name", None) for arr in arrays] - - return cls( - levels=levels, - codes=codes, - sortorder=sortorder, - names=names, - verify_integrity=False, - ) + # convert non-na-likes to str + result = lib.ensure_string_array(scalars, copy=copy) + return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod - @names_compat - def from_tuples( - cls, - tuples: Iterable[tuple[Hashable, ...]], - sortorder: int | None = None, - names: Sequence[Hashable] | None = None, - ) -> MultiIndex: + def _from_sequence_of_strings( + cls, strings, dtype: Dtype | None = None, copy: bool = False + ): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @property + def dtype(self) -> StringDtype: + """ + An instance of 'string[pyarrow]'. """ - Convert list of tuples to MultiIndex. + return self._dtype - Parameters - ---------- - tuples : list / sequence of tuple-likes - Each tuple is the index of one row/column. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of str, optional - Names for the levels in the index. + def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.to_numpy(dtype=dtype) - Returns - ------- - MultiIndex + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert to a NumPy ndarray. + """ + # TODO: copy argument is ignored - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + result = np.array(self._data, dtype=dtype) + if self._data.null_count > 0: + if na_value is lib.no_default: + if dtype and np.issubdtype(dtype, np.floating): + return result + na_value = self._dtype.na_value + mask = self.isna() + result[mask] = na_value + return result - Examples - -------- - >>> tuples = [(1, 'red'), (1, 'blue'), - ... 
(2, 'red'), (2, 'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - names=['number', 'color']) - """ - if not is_list_like(tuples): - raise TypeError("Input must be a list / sequence of tuple-likes.") - elif is_iterator(tuples): - tuples = list(tuples) - tuples = cast(Collection[Tuple[Hashable, ...]], tuples) - - arrays: list[Sequence[Hashable]] - if len(tuples) == 0: - if names is None: - raise TypeError("Cannot infer number of levels from empty list") - arrays = [[]] * len(names) - elif isinstance(tuples, (np.ndarray, Index)): - if isinstance(tuples, Index): - tuples = np.asarray(tuples._values) - - arrays = list(lib.tuples_to_object_array(tuples).T) - elif isinstance(tuples, list): - arrays = list(lib.to_object_array_tuples(tuples).T) + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + encoded = self._data.dictionary_encode() + indices = pa.chunked_array( + [c.indices for c in encoded.chunks], type=encoded.type.index_type + ).to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(np.int64, copy=False) + + if encoded.num_chunks: + uniques = type(self)(encoded.chunk(0).dictionary) else: - arrs = zip(*tuples) - arrays = cast(List[Sequence[Hashable]], arrs) + uniques = type(self)(pa.array([], type=encoded.type.value_type)) - if all(isinstance(e, tuple) for e in tuples): - if not all(tuples): - return cls.from_arrays(tuples, sortorder=sortorder, names=names) + return indices.values, uniques - return cls.from_arrays(arrays, sortorder=sortorder, names=names) + @overload + def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: + ... - @classmethod - def from_product( - cls, iterables, sortorder=None, names=lib.no_default - ) -> MultiIndex: - """ - Make a MultiIndex from the cartesian product of multiple iterables. + @overload + def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray: + ... + + def __getitem__( + self: ArrowStringArray, item: PositionalIndexer + ) -> ArrowStringArray | ArrowStringScalarOrNAT: + """Select a subset of self. Parameters ---------- - iterables : list / sequence of iterables - Each iterable has unique labels for each level of the index. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of str, optional - Names for the levels in the index. - - .. versionchanged:: 1.0.0 - - If not explicitly provided, names will be inferred from the - elements of iterables if an element has a name attribute + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' Returns ------- - MultiIndex + item : scalar or ExtensionArray - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. 
+ """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item.dtype): + return self.take(item) + elif is_bool_dtype(item.dtype): + return type(self)(self._data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + # error: Non-overlapping identity check (left operand type: + # "Union[Union[int, integer[Any]], Union[slice, List[int], + # ndarray[Any, Any]]]", right operand type: "ellipsis") + if item is Ellipsis: # type: ignore[comparison-overlap] + # TODO: should be handled by pyarrow? + item = slice(None) + + if is_scalar(item) and not is_integer(item): + # e.g. "foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + value = self._data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return self._as_pandas_scalar(value) - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = ['green', 'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - ... names=['number', 'color']) - MultiIndex([(0, 'green'), - (0, 'purple'), - (1, 'green'), - (1, 'purple'), - (2, 'green'), - (2, 'purple')], - names=['number', 'color']) - """ - from pandas.core.reshape.util import cartesian_product + def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): + scalar = arrow_scalar.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar - if not is_list_like(iterables): - raise TypeError("Input must be a list / sequence of iterables.") - elif is_iterator(iterables): - iterables = list(iterables) + def _cmp_method(self, other, op): + from pandas.arrays import BooleanArray - codes, levels = factorize_from_iterables(iterables) - if names is lib.no_default: - names = [getattr(it, "name", None) for it in iterables] + pc_func = ARROW_CMP_FUNCS[op.__name__] + if isinstance(other, ArrowStringArray): + result = pc_func(self._data, other._data) + elif isinstance(other, (np.ndarray, list)): + result = pc_func(self._data, other) + elif is_scalar(other): + try: + result = pc_func(self._data, pa.scalar(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + result[valid] = op(np.array(self)[valid], other) + return BooleanArray(result, mask) + else: + return NotImplemented - # codes are all ndarrays, so cartesian_product is lossless - codes = cartesian_product(codes) - return cls(levels, codes, sortorder=sortorder, names=names) + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return BooleanArray._from_sequence(result.to_pandas().values) - @classmethod - def from_frame(cls, df: DataFrame, sortorder=None, names=None) -> MultiIndex: - """ - Make a MultiIndex from a DataFrame. + def insert(self, loc: int, item): + if not isinstance(item, str) and item is not libmissing.NA: + raise TypeError("Scalar must be NA or str") + return super().insert(loc, item) + + def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: + """Set one or more values inplace. 
Parameters ---------- - df : DataFrame - DataFrame to be converted to MultiIndex. - sortorder : int, optional - Level of sortedness (must be lexicographically sorted by that - level). - names : list-like, optional - If no names are provided, use the column names, or tuple of column - names if the columns is a MultiIndex. If a sequence, overwrite - names with the given sequence. - - Returns - ------- - MultiIndex - The MultiIndex representation of the given DataFrame. + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - - Examples - -------- - >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], - ... ['NJ', 'Temp'], ['NJ', 'Precip']], - ... columns=['a', 'b']) - >>> df - a b - 0 HI Temp - 1 HI Precip - 2 NJ Temp - 3 NJ Precip - - >>> pd.MultiIndex.from_frame(df) - MultiIndex([('HI', 'Temp'), - ('HI', 'Precip'), - ('NJ', 'Temp'), - ('NJ', 'Precip')], - names=['a', 'b']) - - Using explicit names, instead of the column names - - >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) - MultiIndex([('HI', 'Temp'), - ('HI', 'Precip'), - ('NJ', 'Temp'), - ('NJ', 'Precip')], - names=['state', 'observation']) - """ - if not isinstance(df, ABCDataFrame): - raise TypeError("Input must be a DataFrame") - - column_names, columns = zip(*df.items()) - names = column_names if names is None else names - return cls.from_arrays(columns, sortorder=sortorder, names=names) - - # -------------------------------------------------------------------- - - @cache_readonly - def _values(self) -> np.ndarray: - # We override here, since our parent uses _data, which we don't use. - values = [] - - for i in range(self.nlevels): - vals = self._get_level_values(i) - if is_categorical_dtype(vals.dtype): - vals = cast("CategoricalIndex", vals) - vals = vals._data._internal_get_values() - if isinstance(vals.dtype, ExtensionDtype) or isinstance( - vals, (ABCDatetimeIndex, ABCTimedeltaIndex) - ): - vals = vals.astype(object) - # error: Incompatible types in assignment (expression has type "ndarray", - # variable has type "Index") - vals = np.array(vals, copy=False) # type: ignore[assignment] - values.append(vals) - - arr = lib.fast_zip(values) - return arr + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object - @property - def values(self) -> np.ndarray: - return self._values + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. - @property - def array(self): + Returns + ------- + None """ - Raises a ValueError for `MultiIndex` because there's no single - array backing a MultiIndex. + key = check_array_indexer(self, key) - Raises - ------ - ValueError - """ - raise ValueError( - "MultiIndex has no single backing array. Use " - "'MultiIndex.to_numpy()' to get a NumPy array of tuples." - ) + if is_integer(key): + key = cast(int, key) - @cache_readonly - def dtypes(self) -> Series: - """ - Return the dtypes as a Series for the underlying MultiIndex. 
- """ - from pandas import Series + if not is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") - names = com.fill_missing_names([level.name for level in self.levels]) - return Series([level.dtype for level in self.levels], index=names) + # Slice data and insert in-between + new_data = [ + *self._data[0:key].chunks, + pa.array([value], type=pa.string()), + *self._data[(key + 1) :].chunks, + ] + self._data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + elif is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) - def __len__(self) -> int: - return len(self.codes[0]) + if is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) - # -------------------------------------------------------------------- - # Levels Methods + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") - @cache_readonly - def levels(self) -> FrozenList: - # Use cache_readonly to ensure that self.get_locs doesn't repeatedly - # create new IndexEngine - # https://github.com/pandas-dev/pandas/issues/31648 - result = [x._rename(name=name) for x, name in zip(self._levels, self._names)] - for level in result: - # disallow midx.levels[0].name = "foo" - level._no_setting_name = True - return FrozenList(result) + for k, v in zip(key_array, value): + self[k] = v - def _set_levels( + def take( self, - levels, - *, - level=None, - copy: bool = False, - validate: bool = True, - verify_integrity: bool = False, - ) -> None: - # This is NOT part of the levels property because it should be - # externally not allowed to set levels. User beware if you change - # _levels directly - if validate: - if len(levels) == 0: - raise ValueError("Must set non-zero number of levels.") - if level is None and len(levels) != self.nlevels: - raise ValueError("Length of levels must match number of levels.") - if level is not None and len(levels) != len(level): - raise ValueError("Length of levels must match length of level.") - - if level is None: - new_levels = FrozenList( - ensure_index(lev, copy=copy)._view() for lev in levels - ) - else: - level_numbers = [self._get_level_number(lev) for lev in level] - new_levels_list = list(self._levels) - for lev_num, lev in zip(level_numbers, levels): - new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view() - new_levels = FrozenList(new_levels_list) - - if verify_integrity: - new_codes = self._verify_integrity(levels=new_levels) - self._codes = new_codes - - names = self.names - self._levels = new_levels - if any(names): - self._set_names(names) - - self._reset_cache() - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "levels"]) - def set_levels( - self, levels, level=None, inplace=None, verify_integrity: bool = True + indices: TakeIndexer, + allow_fill: bool = False, + fill_value: Any = None, ): """ - Set new levels on MultiIndex. Defaults to returning new index. + Take elements from an array. 
Parameters ---------- - levels : sequence or list of sequence - New level(s) to apply. - level : int, level name, or sequence of int/level names (default None) - Level(s) to set (None for all levels). - inplace : bool - If True, mutates in place. - - .. deprecated:: 1.2.0 - verify_integrity : bool, default True - If True, checks that levels and codes are compatible. + indices : sequence of int or one-dimensional np.ndarray of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. - Returns - ------- - new index (of same type and class...etc) or None - The same type as the caller or None if ``inplace=True``. - - Examples - -------- - >>> idx = pd.MultiIndex.from_tuples( - ... [ - ... (1, "one"), - ... (1, "two"), - ... (2, "one"), - ... (2, "two"), - ... (3, "one"), - ... (3, "two") - ... ], - ... names=["foo", "bar"] - ... ) - >>> idx - MultiIndex([(1, 'one'), - (1, 'two'), - (2, 'one'), - (2, 'two'), - (3, 'one'), - (3, 'two')], - names=['foo', 'bar']) - - >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) - MultiIndex([('a', 1), - ('a', 2), - ('b', 1), - ('b', 2), - ('c', 1), - ('c', 2)], - names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b', 'c'], level=0) - MultiIndex([('a', 'one'), - ('a', 'two'), - ('b', 'one'), - ('b', 'two'), - ('c', 'one'), - ('c', 'two')], - names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level='bar') - MultiIndex([(1, 'a'), - (1, 'b'), - (2, 'a'), - (2, 'b'), - (3, 'a'), - (3, 'b')], - names=['foo', 'bar']) - - If any of the levels passed to ``set_levels()`` exceeds the - existing length, all of the values from that argument will - be stored in the MultiIndex levels, though the values will - be truncated in the MultiIndex output. - - >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) - MultiIndex([('a', 1), - ('a', 2), - ('b', 1), - ('b', 2), - ('c', 1), - ('c', 2)], - names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels - FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) - """ - if inplace is not None: - warnings.warn( - "inplace is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - inplace = False - - if is_list_like(levels) and not isinstance(levels, Index): - levels = list(levels) + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. - level, levels = _require_listlike(level, levels, "Levels") + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. - if inplace: - idx = self - else: - idx = self._view() - idx._reset_identity() - idx._set_levels( - levels, level=level, validate=True, verify_integrity=verify_integrity - ) - if not inplace: - return idx + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. - @property - def nlevels(self) -> int: - """ - Integer number of levels in this MultiIndex. + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. 
- Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) - >>> mi - MultiIndex([('a', 'b', 'c')], - ) - >>> mi.nlevels - 3 - """ - return len(self._levels) + Returns + ------- + ExtensionArray - @property - def levshape(self) -> Shape: - """ - A tuple with the length of each level. + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. - Examples + See Also -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) - >>> mi - MultiIndex([('a', 'b', 'c')], - ) - >>> mi.levshape - (1, 1, 1) - """ - return tuple(len(x) for x in self.levels) + numpy.take + api.extensions.take - # -------------------------------------------------------------------- - # Codes Methods + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + # error: Incompatible types in assignment (expression has type + # "Sequence[int]", variable has type "ndarray") + indices_array = indices # type: ignore[assignment] - @property - def codes(self): - return self._codes + if len(self._data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._data): + raise IndexError("out of bounds value in 'indices'.") - def _set_codes( - self, - codes, - *, - level=None, - copy: bool = False, - validate: bool = True, - verify_integrity: bool = False, - ) -> None: - if validate: - if level is None and len(codes) != self.nlevels: - raise ValueError("Length of codes must match number of levels") - if level is not None and len(codes) != len(level): - raise ValueError("Length of codes must match length of levels.") - - if level is None: - new_codes = FrozenList( - _coerce_indexer_frozen(level_codes, lev, copy=copy).view() - for lev, level_codes in zip(self._levels, codes) - ) - else: - level_numbers = [self._get_level_number(lev) for lev in level] - new_codes_list = list(self._codes) - for lev_num, level_codes in zip(level_numbers, codes): - lev = self.levels[lev_num] - new_codes_list[lev_num] = _coerce_indexer_frozen( - level_codes, lev, copy=copy - ) - new_codes = FrozenList(new_codes_list) + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._data)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._data.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
+            if (indices_array < 0).any():
+                # Don't modify in-place
+                indices_array = np.copy(indices_array)
+                indices_array[indices_array < 0] += len(self._data)
+            return type(self)(self._data.take(indices_array))
+
+    def isin(self, values):
+        if pa_version_under2p0:
+            return super().isin(values)
+
+        value_set = [
+            pa_scalar.as_py()
+            for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
+            if pa_scalar.type in (pa.string(), pa.null())
+        ]

-        if verify_integrity:
-            new_codes = self._verify_integrity(codes=new_codes)
+        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns
+        # True for null values, so we short-circuit and return an all-False array.
+        if not len(value_set):
+            return np.zeros(len(self), dtype=bool)

-        self._codes = new_codes
+        kwargs = {}
+        if pa_version_under3p0:
+            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword; in
+            # pyarrow 3.0.0+ it raises with an unexpected keyword argument
+            kwargs["skip_null"] = True

-        self._reset_cache()
+        result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
+        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert
+        # nulls to False
+        return np.array(result, dtype=np.bool_)

-    @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "codes"])
-    def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = True):
+    def value_counts(self, dropna: bool = True) -> Series:
         """
-        Set new codes on MultiIndex. Defaults to returning new index.
+        Return a Series containing counts of each unique value.

         Parameters
         ----------
-        codes : sequence or list of sequence
-            New codes to apply.
-        level : int, level name, or sequence of int/level names (default None)
-            Level(s) to set (None for all levels).
-        inplace : bool
-            If True, mutates in place.
-
-            .. deprecated:: 1.2.0
-        verify_integrity : bool, default True
-            If True, checks that levels and codes are compatible.
+        dropna : bool, default True
+            Don't include counts of missing values.

         Returns
         -------
-        new index (of same type and class...etc) or None
-            The same type as the caller or None if ``inplace=True``.
+        counts : Series

-        Examples
+        See Also
         --------
-        >>> idx = pd.MultiIndex.from_tuples(
-        ...     [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"]
-        ...
) - >>> idx - MultiIndex([(1, 'one'), - (1, 'two'), - (2, 'one'), - (2, 'two')], - names=['foo', 'bar']) - - >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) - MultiIndex([(2, 'one'), - (1, 'one'), - (2, 'two'), - (1, 'two')], - names=['foo', 'bar']) - >>> idx.set_codes([1, 0, 1, 0], level=0) - MultiIndex([(2, 'one'), - (1, 'two'), - (2, 'one'), - (1, 'two')], - names=['foo', 'bar']) - >>> idx.set_codes([0, 0, 1, 1], level='bar') - MultiIndex([(1, 'one'), - (1, 'one'), - (2, 'two'), - (2, 'two')], - names=['foo', 'bar']) - >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1]) - MultiIndex([(2, 'one'), - (1, 'one'), - (2, 'two'), - (1, 'two')], - names=['foo', 'bar']) + Series.value_counts """ - if inplace is not None: - warnings.warn( - "inplace is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - inplace = False - - level, codes = _require_listlike(level, codes, "Codes") - - if inplace: - idx = self - else: - idx = self._view() - idx._reset_identity() - idx._set_codes(codes, level=level, verify_integrity=verify_integrity) - if not inplace: - return idx - - # -------------------------------------------------------------------- - # Index Internals - - @cache_readonly - def _engine(self): - # Calculate the number of bits needed to represent labels in each - # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) - - # Sum bit counts, starting from the _right_.... - lev_bits = np.cumsum(sizes[::-1])[::-1] - - # ... in order to obtain offsets such that sorting the combination of - # shifted codes (one for each level, resulting in a unique integer) is - # equivalent to sorting lexicographically the codes themselves. Notice - # that each level needs to be shifted by the number of bits needed to - # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") - - # Check the total number of bits needed for our representation: - if lev_bits[0] > 64: - # The levels would overflow a 64 bit uint - use Python integers: - return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) - - # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return - # type "Type[MultiIndex]" in supertype "Index" - @property - def _constructor(self) -> Callable[..., MultiIndex]: # type: ignore[override] - return type(self).from_tuples - - @doc(Index._shallow_copy) - def _shallow_copy(self, values: np.ndarray, name=lib.no_default) -> MultiIndex: - names = name if name is not lib.no_default else self.names - - return type(self).from_tuples(values, sortorder=None, names=names) - - def _view(self) -> MultiIndex: - result = type(self)( - levels=self.levels, - codes=self.codes, - sortorder=self.sortorder, - names=self.names, - verify_integrity=False, + from pandas import ( + Index, + Series, ) - result._cache = self._cache.copy() - result._cache.pop("levels", None) # GH32669 - return result - # -------------------------------------------------------------------- + vc = self._data.value_counts() - def copy( - self, - names=None, - dtype=None, - levels=None, - codes=None, - deep=False, - name=None, - ): - """ - Make a copy of this object. Names, dtype, levels and codes can be - passed and will be set on new copy. 
+        values = vc.field(0)
+        counts = vc.field(1)
+        if dropna and self._data.null_count > 0:
+            mask = values.is_valid()
+            values = values.filter(mask)
+            counts = counts.filter(mask)

-        Parameters
-        ----------
-        names : sequence, optional
-        dtype : numpy dtype or pandas type, optional
+        # No missing values so we can adhere to the interface and return a numpy array.
+        counts = np.array(counts)

-            .. deprecated:: 1.2.0
-        levels : sequence, optional
+        index = Index(type(self)(values))

-            .. deprecated:: 1.2.0
-        codes : sequence, optional
+        return Series(counts, index=index).astype("Int64")

-            .. deprecated:: 1.2.0
-        deep : bool, default False
-        name : Label
-            Kept for compatibility with 1-dimensional Index. Should not be used.
+    def astype(self, dtype, copy: bool = True):
+        dtype = pandas_dtype(dtype)

-        Returns
-        -------
-        MultiIndex
+        if is_dtype_equal(dtype, self.dtype):
+            if copy:
+                return self.copy()
+            return self

-        Notes
-        -----
-        In most cases, there should be no functional difference from using
-        ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
-        This could be potentially expensive on large MultiIndex objects.
-        """
-        names = self._validate_names(name=name, names=names, deep=deep)
-        if levels is not None:
-            warnings.warn(
-                "parameter levels is deprecated and will be removed in a future "
-                "version. Use the set_levels method instead.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-        if codes is not None:
-            warnings.warn(
-                "parameter codes is deprecated and will be removed in a future "
-                "version. Use the set_codes method instead.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
+        elif isinstance(dtype, NumericDtype):
+            data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
+            return dtype.__from_arrow__(data)

-        if deep:
-            from copy import deepcopy
+        return super().astype(dtype, copy=copy)

-            if levels is None:
-                levels = deepcopy(self.levels)
-            if codes is None:
-                codes = deepcopy(self.codes)
+    # ------------------------------------------------------------------------
+    # String methods interface

-        levels = levels if levels is not None else self.levels
-        codes = codes if codes is not None else self.codes
+    # error: Cannot determine type of 'na_value'
+    _str_na_value = StringDtype.na_value  # type: ignore[has-type]

-        new_index = type(self)(
-            levels=levels,
-            codes=codes,
-            sortorder=self.sortorder,
-            names=names,
-            verify_integrity=False,
-        )
-        new_index._cache = self._cache.copy()
-        new_index._cache.pop("levels", None)  # GH32669
-
-        if dtype:
-            warnings.warn(
-                "parameter dtype is deprecated and will be removed in a future "
-                "version. Use the astype method instead.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-            new_index = new_index.astype(dtype)
-        return new_index
+    def _str_map(
+        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+    ):
+        # TODO: de-duplicate with StringArray method. This method is more or less
+        # a copy-and-paste.
- def __array__(self, dtype=None) -> np.ndarray: - """the array interface, return my values""" - return self.values + from pandas.arrays import ( + BooleanArray, + IntegerArray, + ) - def view(self, cls=None): - """this is defined as a copy with the same identity""" - result = self.copy() - result._id = self._id - return result + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value - @doc(Index.__contains__) - def __contains__(self, key: Any) -> bool: - hash(key) - try: - self.get_loc(key) - return True - except (LookupError, TypeError, ValueError): - return False + mask = isna(self) + arr = np.asarray(self) - @cache_readonly - def dtype(self) -> np.dtype: - return np.dtype("O") + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray] | type[BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) - def _is_memory_usage_qualified(self) -> bool: - """return a boolean if we need a qualified .info display""" + if not na_value_is_na: + mask[:] = False - def f(level): - return "mixed" in level or "string" in level or "unicode" in level + return constructor(result, mask) - return any(f(level) for level in self._inferred_type_levels) + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): + if flags: + return super()._str_contains(pat, case, flags, na, regex) + + if regex: + if pa_version_under4p0 or case is False: + return super()._str_contains(pat, case, flags, na, regex) + else: + result = pc.match_substring_regex(self._data, pat) + else: + if case: + result = pc.match_substring(self._data, pat) + else: + result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result - @doc(Index.memory_usage) - def memory_usage(self, deep: bool = False) -> int: - # we are overwriting our base class to avoid - # computing .values here which could materialize - # a tuple representation unnecessarily - return self._nbytes(deep) + def _str_startswith(self, pat: str, na=None): + if pa_version_under4p0: + return super()._str_startswith(pat, na) - @cache_readonly - def nbytes(self) -> int: - """return the number of bytes in the underlying data""" - return self._nbytes(False) + pat = "^" + re.escape(pat) + return self._str_contains(pat, na=na, regex=True) - def _nbytes(self, deep: bool = False) -> int: - """ - return the number of bytes in the underlying data - deeply introspect the level data if deep=True + def _str_endswith(self, pat: str, na=None): + if pa_version_under4p0: + return super()._str_endswith(pat, na) - include the engine hashtable + pat = re.escape(pat) + "$" + return self._str_contains(pat, na=na, regex=True) - *this is in internal routine* + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if ( + pa_version_under4p0 + or isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + ): + return super()._str_replace(pat, repl, n, case, flags, regex) - """ - # for implementations with no useful getsizeof (PyPy) - objsize = 24 + func = pc.replace_substring_regex if regex else pc.replace_substring + result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + return type(self)(result) - level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels) - label_nbytes = sum(i.nbytes for i in self.codes) - names_nbytes = sum(getsizeof(i, objsize) for i in self.names) - result = level_nbytes + label_nbytes + names_nbytes + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if pa_version_under4p0: + return super()._str_match(pat, case, flags, na) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result + if not pat.startswith("^"): + pat = "^" + pat + return self._str_contains(pat, case, flags, na, regex=True) - # -------------------------------------------------------------------- - # Rendering Methods + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if pa_version_under4p0: + return super()._str_fullmatch(pat, case, flags, na) - def _formatter_func(self, tup): - """ - Formats each item in tup according to its level's formatter function. 
- """ - formatter_funcs = [level._formatter_func for level in self.levels] - return tuple(func(val) for func, val in zip(formatter_funcs, tup)) - - def _format_native_types(self, *, na_rep="nan", **kwargs): - new_levels = [] - new_codes = [] - - # go through the levels and format them - for level, level_codes in zip(self.levels, self.codes): - level_strs = level._format_native_types(na_rep=na_rep, **kwargs) - # add nan values, if there are any - mask = level_codes == -1 - if mask.any(): - nan_index = len(level_strs) - # numpy 1.21 deprecated implicit string casting - level_strs = level_strs.astype(str) - level_strs = np.append(level_strs, na_rep) - assert not level_codes.flags.writeable # i.e. copy is needed - level_codes = level_codes.copy() # make writeable - level_codes[mask] = nan_index - new_levels.append(level_strs) - new_codes.append(level_codes) - - if len(new_levels) == 1: - # a single-level multi-index - return Index(new_levels[0].take(new_codes[0]))._format_native_types() - else: - # reconstruct the multi-index - mi = MultiIndex( - levels=new_levels, - codes=new_codes, - names=self.names, - sortorder=self.sortorder, - verify_integrity=False, - ) - return mi._values + if not pat.endswith("$") or pat.endswith("//$"): + pat = pat + "$" + return self._str_match(pat, case, flags, na) - def format( - self, - name: bool | None = None, - formatter: Callable | None = None, - na_rep: str | None = None, - names: bool = False, - space: int = 2, - sparsify=None, - adjoin: bool = True, - ) -> list: - if name is not None: - names = name - - if len(self) == 0: - return [] - - stringified_levels = [] - for lev, level_codes in zip(self.levels, self.codes): - na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) - - if len(lev) > 0: - - formatted = lev.take(level_codes).format(formatter=formatter) - - # we have some NA - mask = level_codes == -1 - if mask.any(): - formatted = np.array(formatted, dtype=object) - formatted[mask] = na - formatted = formatted.tolist() + def _str_isalnum(self): + result = pc.utf8_is_alnum(self._data) + return BooleanDtype().__from_arrow__(result) - else: - # weird all NA case - formatted = [ - pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) - for x in algos.take_nd(lev._values, level_codes) - ] - stringified_levels.append(formatted) - - result_levels = [] - for lev, lev_name in zip(stringified_levels, self.names): - level = [] - - if names: - level.append( - pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) - if lev_name is not None - else "" - ) + def _str_isalpha(self): + result = pc.utf8_is_alpha(self._data) + return BooleanDtype().__from_arrow__(result) - level.extend(np.array(lev, dtype=object)) - result_levels.append(level) - - if sparsify is None: - sparsify = get_option("display.multi_sparse") - - if sparsify: - sentinel = "" - # GH3547 use value of sparsify as sentinel if it's "Falsey" - assert isinstance(sparsify, bool) or sparsify is lib.no_default - if sparsify in [False, lib.no_default]: - sentinel = sparsify - # little bit of a kludge job for #1217 - result_levels = sparsify_labels( - result_levels, start=int(names), sentinel=sentinel - ) + def _str_isdecimal(self): + result = pc.utf8_is_decimal(self._data) + return BooleanDtype().__from_arrow__(result) - if adjoin: - from pandas.io.formats.format import get_adjustment + def _str_isdigit(self): + result = pc.utf8_is_digit(self._data) + return BooleanDtype().__from_arrow__(result) - adj = get_adjustment() - return adj.adjoin(space, *result_levels).split("\n") - else: 
- return result_levels + def _str_islower(self): + result = pc.utf8_is_lower(self._data) + return BooleanDtype().__from_arrow__(result) - # -------------------------------------------------------------------- - # Names Methods + def _str_isnumeric(self): + result = pc.utf8_is_numeric(self._data) + return BooleanDtype().__from_arrow__(result) - def _get_names(self) -> FrozenList: - return FrozenList(self._names) + def _str_isspace(self): + if pa_version_under2p0: + return super()._str_isspace() - def _set_names(self, names, *, level=None, validate: bool = True): - """ - Set new names on index. Each name has to be a hashable type. + result = pc.utf8_is_space(self._data) + return BooleanDtype().__from_arrow__(result) - Parameters - ---------- - values : str or sequence - name(s) to set - level : int, level name, or sequence of int/level names (default None) - If the index is a MultiIndex (hierarchical), level(s) to set (None - for all levels). Otherwise level must be None - validate : bool, default True - validate that the names match level lengths + def _str_istitle(self): + result = pc.utf8_is_title(self._data) + return BooleanDtype().__from_arrow__(result) - Raises - ------ - TypeError if each name is not hashable. + def _str_isupper(self): + result = pc.utf8_is_upper(self._data) + return BooleanDtype().__from_arrow__(result) - Notes - ----- - sets names on levels. WARNING: mutates! + def _str_len(self): + if pa_version_under4p0: + return super()._str_len() - Note that you generally want to set this *after* changing levels, so - that it only acts on copies - """ - # GH 15110 - # Don't allow a single string for names in a MultiIndex - if names is not None and not is_list_like(names): - raise ValueError("Names should be list-like for a MultiIndex") - names = list(names) - - if validate: - if level is not None and len(names) != len(level): - raise ValueError("Length of names must match length of level.") - if level is None and len(names) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) + result = pc.utf8_length(self._data) + return Int64Dtype().__from_arrow__(result) - if level is None: - level = range(self.nlevels) - else: - level = [self._get_level_number(lev) for lev in level] - - # set the name - for lev, name in zip(level, names): - if name is not None: - # GH 20527 - # All items in 'names' need to be hashable: - if not is_hashable(name): - raise TypeError( - f"{type(self).__name__}.name must be a hashable type" - ) - # error: Cannot determine type of '__setitem__' - self._names[lev] = name # type: ignore[has-type] - - # If .levels has been accessed, the names in our cache will be stale. - self._reset_cache() - - names = property( - fset=_set_names, - fget=_get_names, - doc=""" - Names of levels in MultiIndex. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays( - ... 
[[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) - >>> mi - MultiIndex([(1, 3, 5), - (2, 4, 6)], - names=['x', 'y', 'z']) - >>> mi.names - FrozenList(['x', 'y', 'z']) - """, - ) - - # -------------------------------------------------------------------- - - @doc(Index._get_grouper_for_level) - def _get_grouper_for_level( - self, mapper, *, level=None - ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: - indexer = self.codes[level] - level_index = self.levels[level] - - if mapper is not None: - # Handle group mapping function and return - level_values = self.levels[level].take(indexer) - grouper = level_values.map(mapper) - return grouper, None, None - - codes, uniques = algos.factorize(indexer, sort=True) - - if len(uniques) > 0 and uniques[0] == -1: - # Handle NAs - mask = indexer != -1 - ok_codes, uniques = algos.factorize(indexer[mask], sort=True) - - codes = np.empty(len(indexer), dtype=indexer.dtype) - codes[mask] = ok_codes - codes[~mask] = -1 - - if len(uniques) < len(level_index): - # Remove unobserved levels from level_index - level_index = level_index.take(uniques) - else: - # break references back to us so that setting the name - # on the output of a groupby doesn't reflect back here. - level_index = level_index.copy() + def _str_lower(self): + return type(self)(pc.utf8_lower(self._data)) - if level_index._can_hold_na: - grouper = level_index.take(codes, fill_value=True) - else: - grouper = level_index.take(codes) + def _str_upper(self): + return type(self)(pc.utf8_upper(self._data)) - return grouper, codes, level_index + def _str_strip(self, to_strip=None): + if pa_version_under4p0: + return super()._str_strip(to_strip) - @cache_readonly - def inferred_type(self) -> str: - return "mixed" + if to_strip is None: + result = pc.utf8_trim_whitespace(self._data) + else: + result = pc.utf8_trim(self._data, characters=to_strip) + return type(self)(result) - def _get_level_number(self, level) -> int: - count = self.names.count(level) - if (count > 1) and not is_integer(level): - raise ValueError( - f"The name {level} occurs multiple times, use a level number" - ) - try: - level = self.names.index(level) - except ValueError as err: - if not is_integer(level): - raise KeyError(f"Level {level} not found") from err - elif level < 0: - level += self.nlevels - if level < 0: - orig_level = level - self.nlevels - raise IndexError( - f"Too many levels: Index has only {self.nlevels} levels, " - f"{orig_level} is not a valid level number" - ) from err - # Note: levels are zero-based - elif level >= self.nlevels: - raise IndexError( - f"Too many levels: Index has only {self.nlevels} levels, " - f"not {level + 1}" - ) from err - return level + def _str_lstrip(self, to_strip=None): + if pa_version_under4p0: + return super()._str_lstrip(to_strip) - @cache_readonly - def is_monotonic_increasing(self) -> bool: - """ - return if the index is monotonic increasing (only equal or - increasing) values. - """ - if any(-1 in code for code in self.codes): - return False + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._data) + else: + result = pc.utf8_ltrim(self._data, characters=to_strip) + return type(self)(result) - if all(level.is_monotonic_increasing for level in self.levels): - # If each level is sorted, we can operate on the codes directly. 
GH27495 - return libalgos.is_lexsorted( - [x.astype("int64", copy=False) for x in self.codes] - ) + def _str_rstrip(self, to_strip=None): + if pa_version_under4p0: + return super()._str_rstrip(to_strip) - # reversed() because lexsort() wants the most significant key last. - values = [ - self._get_level_values(i)._values for i in reversed(range(len(self.levels))) - ] - try: - # Argument 1 to "lexsort" has incompatible type "List[Union[ExtensionArray, - # ndarray[Any, Any]]]"; expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, - # int, float, complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" [arg-type] - sort_order = np.lexsort(values) # type: ignore[arg-type] - return Index(sort_order).is_monotonic_increasing - except TypeError: - - # we have mixed types and np.lexsort is not happy - return Index(self._values).is_monotonic_increasing - - @cache_readonly - def is_monotonic_decreasing(self) -> bool: - """ - return if the index is monotonic decreasing (only equal or - decreasing) values. - """ - # monotonic decreasing if and only if reverse is monotonic increasing - return self[::-1].is_monotonic_increasing - - @cache_readonly - def _inferred_type_levels(self) -> list[str]: - """return a list of the inferred types, one for each level""" - return [i.inferred_type for i in self.levels] - - @doc(Index.duplicated) - def duplicated(self, keep="first") -> npt.NDArray[np.bool_]: - shape = tuple(len(lev) for lev in self.levels) - ids = get_group_index(self.codes, shape, sort=False, xnull=False) - - return duplicated(ids, keep) - - # error: Cannot override final attribute "_duplicated" - # (previously declared in base class "IndexOpsMixin") - _duplicated = duplicated # type: ignore[misc] - - def fillna(self, value=None, downcast=None): - """ - fillna is not implemented for MultiIndex - """ - raise NotImplementedError("isna is not defined for MultiIndex") - - @doc(Index.dropna) - def dropna(self, how: str = "any") -> MultiIndex: - nans = [level_codes == -1 for level_codes in self.codes] - if how == "any": - indexer = np.any(nans, axis=0) - elif how == "all": - indexer = np.all(nans, axis=0) - else: - raise ValueError(f"invalid how option: {how}") - - new_codes = [level_codes[~indexer] for level_codes in self.codes] - return self.set_codes(codes=new_codes) - - def _get_level_values(self, level: int, unique: bool = False) -> Index: - """ - Return vector of label values for requested level, - equal to the length of the index - - **this is an internal method** - - Parameters - ---------- - level : int - unique : bool, default False - if True, drop duplicated values - - Returns - ------- - Index - """ - lev = self.levels[level] - level_codes = self.codes[level] - name = self._names[level] - if unique: - level_codes = algos.unique(level_codes) - filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value) - return lev._shallow_copy(filled, name=name) - - def get_level_values(self, level): - """ - Return vector of label values for requested level. - - Length of returned vector is equal to the length of the index. - - Parameters - ---------- - level : int or str - ``level`` is either the integer position of the level in the - MultiIndex, or the name of the level. - - Returns - ------- - values : Index - Values is a level of this MultiIndex converted to - a single :class:`Index` (or subclass thereof). 
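The codes-based `duplicated` above reduces each tuple of level codes to a single group id before deduplicating; in use, a quick sketch (illustrative only, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 1, 2], ["a", "a", "b"]])
    mi.duplicated()              # array([False,  True, False])
    mi.duplicated(keep="last")   # array([ True, False, False])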
- - Notes - ----- - If the level contains missing values, the result may be casted to - ``float`` with missing values specified as ``NaN``. This is because - the level is converted to a regular ``Index``. - - Examples - -------- - Create a MultiIndex: - - >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def'))) - >>> mi.names = ['level_1', 'level_2'] - - Get level values by supplying level as either integer or name: - - >>> mi.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object', name='level_1') - >>> mi.get_level_values('level_2') - Index(['d', 'e', 'f'], dtype='object', name='level_2') - - If a level contains missing values, the return type of the level - maybe casted to ``float``. - - >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).dtypes - level_0 int64 - level_1 int64 - dtype: object - >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).get_level_values(0) - Float64Index([1.0, nan, 2.0], dtype='float64') - """ - level = self._get_level_number(level) - values = self._get_level_values(level) - return values - - @doc(Index.unique) - def unique(self, level=None): - - if level is None: - return super().unique() - else: - level = self._get_level_number(level) - return self._get_level_values(level=level, unique=True) - - def to_frame( - self, - index: bool = True, - name=lib.no_default, - allow_duplicates: bool = False, - ) -> DataFrame: - """ - Create a DataFrame with the levels of the MultiIndex as columns. - - Column ordering is determined by the DataFrame constructor with data as - a dict. - - Parameters - ---------- - index : bool, default True - Set the index of the returned DataFrame as the original MultiIndex. - - name : list / sequence of str, optional - The passed names should substitute index level names. - - allow_duplicates : bool, optional default False - Allow duplicate column labels to be created. - - .. versionadded:: 1.5.0 - - Returns - ------- - DataFrame : a DataFrame containing the original MultiIndex data. - - See Also - -------- - DataFrame : Two-dimensional, size-mutable, potentially heterogeneous - tabular data. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) - >>> mi - MultiIndex([('a', 'c'), - ('b', 'd')], - ) - - >>> df = mi.to_frame() - >>> df - 0 1 - a c a c - b d b d - - >>> df = mi.to_frame(index=False) - >>> df - 0 1 - 0 a c - 1 b d - - >>> df = mi.to_frame(name=['x', 'y']) - >>> df - x y - a c a c - b d b d - """ - from pandas import DataFrame - - if name is None: - warnings.warn( - "Explicitly passing `name=None` currently preserves the Index's name " - "or uses a default name of 0. This behaviour is deprecated, and in " - "the future `None` will be used as the name of the resulting " - "DataFrame column.", - FutureWarning, - stacklevel=find_stack_level(), - ) - name = lib.no_default - - if name is not lib.no_default: - if not is_list_like(name): - raise TypeError("'name' must be a list / sequence of column names.") - - if len(name) != len(self.levels): - raise ValueError( - "'name' should have same length as number of levels on index." 
-            )
-            idx_names = name
-        else:
-            idx_names = self._get_level_names()
-
-        if not allow_duplicates and len(set(idx_names)) != len(idx_names):
-            raise ValueError(
-                "Cannot create duplicate column labels if allow_duplicates is False"
-            )
-
-        # Guarantee resulting column order - PY36+ dict maintains insertion order
-        result = DataFrame(
-            {level: self._get_level_values(level) for level in range(len(self.levels))},
-            copy=False,
-        )
-        result.columns = idx_names
-
-        if index:
-            result.index = self
-        return result
-
-    def to_flat_index(self) -> Index:
-        """
-        Convert a MultiIndex to an Index of Tuples containing the level values.
-
-        Returns
-        -------
-        pd.Index
-            Index with the MultiIndex data represented in Tuples.
-
-        See Also
-        --------
-        MultiIndex.from_tuples : Convert flat index back to MultiIndex.
-
-        Notes
-        -----
-        This method will simply return the caller if called by anything other
-        than a MultiIndex.
-
-        Examples
-        --------
-        >>> index = pd.MultiIndex.from_product(
-        ...     [['foo', 'bar'], ['baz', 'qux']],
-        ...     names=['a', 'b'])
-        >>> index.to_flat_index()
-        Index([('foo', 'baz'), ('foo', 'qux'),
-               ('bar', 'baz'), ('bar', 'qux')],
-              dtype='object')
-        """
-        return Index(self._values, tupleize_cols=False)
-
-    def is_lexsorted(self) -> bool:
-        warnings.warn(
-            "MultiIndex.is_lexsorted is deprecated as a public function, "
-            "users should use MultiIndex.is_monotonic_increasing instead.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-        return self._is_lexsorted()
-
-    def _is_lexsorted(self) -> bool:
-        """
-        Return True if the codes are lexicographically sorted.
-
-        Returns
-        -------
-        bool
-
-        Examples
-        --------
-        In the below examples, the first level of the MultiIndex is sorted because
-        a<b<c, so there is no need to look at the next level.
-
-        >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['d', 'e', 'f']]).is_lexsorted()
-        True
-        >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['d', 'f', 'e']]).is_lexsorted()
-        True
-
-        In case there is a tie, the lexicographical sorting looks
-        at the next level of the MultiIndex.
-
-        >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']]).is_lexsorted()
-        True
-        >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']]).is_lexsorted()
-        False
-        >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
-        ...                            ['aa', 'bb', 'aa', 'bb']]).is_lexsorted()
-        True
-        >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
-        ...                            ['bb', 'aa', 'aa', 'bb']]).is_lexsorted()
-        False
-        """
-        return self._lexsort_depth == self.nlevels
-
-    @property
-    def lexsort_depth(self) -> int:
-        warnings.warn(
-            "MultiIndex.lexsort_depth is deprecated as a public function, "
-            "users should use MultiIndex.is_monotonic_increasing instead.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-        return self._lexsort_depth
-
-    @cache_readonly
-    def _lexsort_depth(self) -> int:
-        """
-        Compute and return the lexsort_depth, the number of levels of the
-        MultiIndex that are sorted lexically
-
-        Returns
-        -------
-        int
-        """
-        if self.sortorder is not None:
-            return self.sortorder
-        return _lexsort_depth(self.codes, self.nlevels)
-
-    def _sort_levels_monotonic(self) -> MultiIndex:
-        """
-        This is an *internal* function.
-
-        Create a new MultiIndex from the current to monotonically sorted
-        items IN the levels. This does not actually make the entire MultiIndex
-        monotonic, JUST the levels.
-
-        The resulting MultiIndex will have the same outward
-        appearance, meaning the same .values and ordering. It will also
-        be .equals() to the original.
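A quick check of the invariant just stated — the rebuilt index still compares equal even though its levels were reordered (a sketch; `_sort_levels_monotonic` is internal API):

    import pandas as pd

    mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
                       codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
    mi2 = mi._sort_levels_monotonic()
    assert mi.equals(mi2)   # same outward .values and ordering
    mi2.levels              # FrozenList([['a', 'b'], ['aa', 'bb']])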
- - Returns - ------- - MultiIndex - - Examples - -------- - >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - >>> mi - MultiIndex([('a', 'bb'), - ('a', 'aa'), - ('b', 'bb'), - ('b', 'aa')], - ) - - >>> mi.sort_values() - MultiIndex([('a', 'aa'), - ('a', 'bb'), - ('b', 'aa'), - ('b', 'bb')], - ) - """ - if self._is_lexsorted() and self.is_monotonic_increasing: - return self - - new_levels = [] - new_codes = [] - - for lev, level_codes in zip(self.levels, self.codes): - - if not lev.is_monotonic_increasing: - try: - # indexer to reorder the levels - indexer = lev.argsort() - except TypeError: - pass - else: - lev = lev.take(indexer) - - # indexer to reorder the level codes - indexer = ensure_platform_int(indexer) - ri = lib.get_reverse_indexer(indexer, len(indexer)) - level_codes = algos.take_nd(ri, level_codes) - - new_levels.append(lev) - new_codes.append(level_codes) - - return MultiIndex( - new_levels, - new_codes, - names=self.names, - sortorder=self.sortorder, - verify_integrity=False, - ) - - def remove_unused_levels(self) -> MultiIndex: - """ - Create new MultiIndex from current that removes unused levels. - - Unused level(s) means levels that are not expressed in the - labels. The resulting MultiIndex will have the same outward - appearance, meaning the same .values and ordering. It will - also be .equals() to the original. - - Returns - ------- - MultiIndex - - Examples - -------- - >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) - >>> mi - MultiIndex([(0, 'a'), - (0, 'b'), - (1, 'a'), - (1, 'b')], - ) - - >>> mi[2:] - MultiIndex([(1, 'a'), - (1, 'b')], - ) - - The 0 from the first level is not represented - and can be removed - - >>> mi2 = mi[2:].remove_unused_levels() - >>> mi2.levels - FrozenList([[1], ['a', 'b']]) - """ - new_levels = [] - new_codes = [] - - changed = False - for lev, level_codes in zip(self.levels, self.codes): - - # Since few levels are typically unused, bincount() is more - # efficient than unique() - however it only accepts positive values - # (and drops order): - uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1 - has_na = int(len(uniques) and (uniques[0] == -1)) - - if len(uniques) != len(lev) + has_na: - - if lev.isna().any() and len(uniques) == len(lev): - break - # We have unused levels - changed = True - - # Recalculate uniques, now preserving order. - # Can easily be cythonized by exploiting the already existing - # "uniques" and stop parsing "level_codes" when all items - # are found: - uniques = algos.unique(level_codes) - if has_na: - na_idx = np.where(uniques == -1)[0] - # Just ensure that -1 is in first position: - uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] - - # codes get mapped from uniques to 0:len(uniques) - # -1 (if present) is mapped to last position - code_mapping = np.zeros(len(lev) + has_na) - # ... 
and reassigned value -1: - code_mapping[uniques] = np.arange(len(uniques)) - has_na - - level_codes = code_mapping[level_codes] - - # new levels are simple - lev = lev.take(uniques[has_na:]) - - new_levels.append(lev) - new_codes.append(level_codes) - - result = self.view() - - if changed: - result._reset_identity() - result._set_levels(new_levels, validate=False) - result._set_codes(new_codes, validate=False) - - return result - - # -------------------------------------------------------------------- - # Pickling Methods - - def __reduce__(self): - """Necessary for making this object picklable""" - d = { - "levels": list(self.levels), - "codes": list(self.codes), - "sortorder": self.sortorder, - "names": list(self.names), - } - return ibase._new_Index, (type(self), d), None - - # -------------------------------------------------------------------- - - def __getitem__(self, key): - if is_scalar(key): - key = com.cast_scalar_indexer(key, warn_float=True) - - retval = [] - for lev, level_codes in zip(self.levels, self.codes): - if level_codes[key] == -1: - retval.append(np.nan) - else: - retval.append(lev[level_codes[key]]) - - return tuple(retval) - else: - # in general cannot be sure whether the result will be sorted - sortorder = None - if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) - sortorder = self.sortorder - elif isinstance(key, slice): - if key.step is None or key.step > 0: - sortorder = self.sortorder - elif isinstance(key, Index): - key = np.asarray(key) - - new_codes = [level_codes[key] for level_codes in self.codes] - - return MultiIndex( - levels=self.levels, - codes=new_codes, - names=self.names, - sortorder=sortorder, - verify_integrity=False, - ) - - def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: - """ - Fastpath for __getitem__ when we know we have a slice. 
- """ - sortorder = None - if slobj.step is None or slobj.step > 0: - sortorder = self.sortorder - - new_codes = [level_codes[slobj] for level_codes in self.codes] - - return type(self)( - levels=self.levels, - codes=new_codes, - names=self._names, - sortorder=sortorder, - verify_integrity=False, - ) - - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take( - self: MultiIndex, - indices, - axis: int = 0, - allow_fill: bool = True, - fill_value=None, - **kwargs, - ) -> MultiIndex: - nv.validate_take((), kwargs) - indices = ensure_platform_int(indices) - - # only fill if we are passing a non-None fill_value - allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) - - na_value = -1 - - taken = [lab.take(indices) for lab in self.codes] - if allow_fill: - mask = indices == -1 - if mask.any(): - masked = [] - for new_label in taken: - label_values = new_label - label_values[mask] = na_value - masked.append(np.asarray(label_values)) - taken = masked - - return MultiIndex( - levels=self.levels, codes=taken, names=self.names, verify_integrity=False - ) - - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - if not isinstance(other, (list, tuple)): - other = [other] - - if all( - (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other - ): - arrays = [] - for i in range(self.nlevels): - label = self._get_level_values(i) - appended = [o._get_level_values(i) for o in other] - arrays.append(label.append(appended)) - return MultiIndex.from_arrays(arrays, names=self.names) - - to_concat = (self._values,) + tuple(k._values for k in other) - new_tuples = np.concatenate(to_concat) - - # if all(isinstance(x, MultiIndex) for x in other): - try: - return MultiIndex.from_tuples(new_tuples, names=self.names) - except (TypeError, IndexError): - return Index._with_infer(new_tuples) - - def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: - return self._values.argsort(*args, **kwargs) - - @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) - def repeat(self, repeats: int, axis=None) -> MultiIndex: - nv.validate_repeat((), {"axis": axis}) - # error: Incompatible types in assignment (expression has type "ndarray", - # variable has type "int") - repeats = ensure_platform_int(repeats) # type: ignore[assignment] - return MultiIndex( - levels=self.levels, - codes=[ - level_codes.view(np.ndarray).astype(np.intp, copy=False).repeat(repeats) - for level_codes in self.codes - ], - names=self.names, - sortorder=self.sortorder, - verify_integrity=False, - ) - - def drop(self, codes, level=None, errors="raise"): - """ - Make new MultiIndex with passed list of codes deleted - - Parameters - ---------- - codes : array-like - Must be a list of tuples when level is not specified - level : int or level name, default None - errors : str, default 'raise' - - Returns - ------- - dropped : MultiIndex - """ - if level is not None: - return self._drop_from_level(codes, level, errors) - - if not isinstance(codes, (np.ndarray, Index)): - try: - codes = com.index_labels_to_array(codes, dtype=np.dtype("object")) - except ValueError: - pass - - inds = [] - for level_codes in codes: - try: - loc = self.get_loc(level_codes) - # get_loc returns either an integer, a slice, or a boolean - # mask - if isinstance(loc, int): - inds.append(loc) - elif isinstance(loc, slice): - step = loc.step if loc.step is not None else 1 - 
inds.extend(range(loc.start, loc.stop, step)) - elif com.is_bool_indexer(loc): - if self._lexsort_depth == 0: - warnings.warn( - "dropping on a non-lexsorted multi-index " - "without a level parameter may impact performance.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) - loc = loc.nonzero()[0] - inds.extend(loc) - else: - msg = f"unsupported indexer of type {type(loc)}" - raise AssertionError(msg) - except KeyError: - if errors != "ignore": - raise - - return self.delete(inds) - - def _drop_from_level(self, codes, level, errors="raise") -> MultiIndex: - codes = com.index_labels_to_array(codes) - i = self._get_level_number(level) - index = self.levels[i] - values = index.get_indexer(codes) - # If nan should be dropped it will equal -1 here. We have to check which values - # are not nan and equal -1, this means they are missing in the index - nan_codes = isna(codes) - values[(np.equal(nan_codes, False)) & (values == -1)] = -2 - if index.shape[0] == self.shape[0]: - values[np.equal(nan_codes, True)] = -2 - - not_found = codes[values == -2] - if len(not_found) != 0 and errors != "ignore": - raise KeyError(f"labels {not_found} not found in level") - mask = ~algos.isin(self.codes[i], values) - - return self[mask] - - def swaplevel(self, i=-2, j=-1) -> MultiIndex: - """ - Swap level i with level j. - - Calling this method does not change the ordering of the values. - - Parameters - ---------- - i : int, str, default -2 - First level of index to be swapped. Can pass level name as string. - Type of parameters can be mixed. - j : int, str, default -1 - Second level of index to be swapped. Can pass level name as string. - Type of parameters can be mixed. - - Returns - ------- - MultiIndex - A new MultiIndex. - - See Also - -------- - Series.swaplevel : Swap levels i and j in a MultiIndex. - DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a - particular axis. - - Examples - -------- - >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - >>> mi - MultiIndex([('a', 'bb'), - ('a', 'aa'), - ('b', 'bb'), - ('b', 'aa')], - ) - >>> mi.swaplevel(0, 1) - MultiIndex([('bb', 'a'), - ('aa', 'a'), - ('bb', 'b'), - ('aa', 'b')], - ) - """ - new_levels = list(self.levels) - new_codes = list(self.codes) - new_names = list(self.names) - - i = self._get_level_number(i) - j = self._get_level_number(j) - - new_levels[i], new_levels[j] = new_levels[j], new_levels[i] - new_codes[i], new_codes[j] = new_codes[j], new_codes[i] - new_names[i], new_names[j] = new_names[j], new_names[i] - - return MultiIndex( - levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False - ) - - def reorder_levels(self, order) -> MultiIndex: - """ - Rearrange levels using input order. May not drop or duplicate levels. - - Parameters - ---------- - order : list of int or list of str - List representing new level order. Reference level by number - (position) or by key (label). 
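For reference, the two dropping modes handled in `drop` above look like this in use (an illustrative sketch, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one')])
    mi.drop([(1, 'two')])    # drop by full label tuple
    mi.drop([1], level=0)    # drop every entry whose level-0 label is 1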
-
-        Returns
-        -------
-        MultiIndex
-
-        Examples
-        --------
-        >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y'])
-        >>> mi
-        MultiIndex([(1, 3),
-                    (2, 4)],
-                   names=['x', 'y'])
-
-        >>> mi.reorder_levels(order=[1, 0])
-        MultiIndex([(3, 1),
-                    (4, 2)],
-                   names=['y', 'x'])
-
-        >>> mi.reorder_levels(order=['y', 'x'])
-        MultiIndex([(3, 1),
-                    (4, 2)],
-                   names=['y', 'x'])
-        """
-        order = [self._get_level_number(i) for i in order]
-        if len(order) != self.nlevels:
-            raise AssertionError(
-                f"Length of order must be same as number of levels ({self.nlevels}), "
-                f"got {len(order)}"
-            )
-        new_levels = [self.levels[i] for i in order]
-        new_codes = [self.codes[i] for i in order]
-        new_names = [self.names[i] for i in order]
-
-        return MultiIndex(
-            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
-        )
-
-    def _get_codes_for_sorting(self) -> list[Categorical]:
-        """
-        we are categorizing our codes by using the
-        available categories (all, not just observed)
-        excluding any missing ones (-1); this is in preparation
-        for sorting, where we need to disambiguate that -1 is not
-        a valid value
-        """
-
-        def cats(level_codes):
-            return np.arange(
-                np.array(level_codes).max() + 1 if len(level_codes) else 0,
-                dtype=level_codes.dtype,
-            )
-
-        return [
-            Categorical.from_codes(level_codes, cats(level_codes), ordered=True)
-            for level_codes in self.codes
-        ]
-
-    def sortlevel(
-        self, level=0, ascending: bool = True, sort_remaining: bool = True
-    ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
-        """
-        Sort MultiIndex at the requested level.
-
-        The result will respect the original ordering of the associated
-        factor at that level.
-
-        Parameters
-        ----------
-        level : list-like, int or str, default 0
-            If a string is given, must be a name of the level.
-            If list-like must be names or ints of levels.
-        ascending : bool, default True
-            False to sort in descending order.
-            Can also be a list to specify a directed ordering.
-        sort_remaining : bool, default True
-            Whether to sort by the remaining levels after ``level``.
-
-        Returns
-        -------
-        sorted_index : pd.MultiIndex
-            Resulting index.
-        indexer : np.ndarray[np.intp]
-            Indices of output values in original index.
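The categorization step in `_get_codes_for_sorting` above can be reproduced in isolation (a sketch of the idea, using the public Categorical API):

    import numpy as np
    from pandas import Categorical

    level_codes = np.array([0, 2, 1, 2], dtype=np.int8)
    cats = np.arange(level_codes.max() + 1, dtype=level_codes.dtype)
    # ordered Categorical over all categories, so -1 (missing) stays distinct
    Categorical.from_codes(level_codes, cats, ordered=True)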
- - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) - >>> mi - MultiIndex([(0, 2), - (0, 1)], - ) - - >>> mi.sortlevel() - (MultiIndex([(0, 1), - (0, 2)], - ), array([1, 0])) - - >>> mi.sortlevel(sort_remaining=False) - (MultiIndex([(0, 2), - (0, 1)], - ), array([0, 1])) - - >>> mi.sortlevel(1) - (MultiIndex([(0, 1), - (0, 2)], - ), array([1, 0])) - - >>> mi.sortlevel(1, ascending=False) - (MultiIndex([(0, 2), - (0, 1)], - ), array([0, 1])) - """ - if isinstance(level, (str, int)): - level = [level] - level = [self._get_level_number(lev) for lev in level] - sortorder = None - - # we have a directed ordering via ascending - if isinstance(ascending, list): - if not len(level) == len(ascending): - raise ValueError("level must have same length as ascending") - - indexer = lexsort_indexer( - [self.codes[lev] for lev in level], orders=ascending - ) - - # level ordering - else: - - codes = list(self.codes) - shape = list(self.levshape) - - # partition codes and shape - primary = tuple(codes[lev] for lev in level) - primshp = tuple(shape[lev] for lev in level) - - # Reverse sorted to retain the order of - # smaller indices that needs to be removed - for lev in sorted(level, reverse=True): - codes.pop(lev) - shape.pop(lev) - - if sort_remaining: - primary += primary + tuple(codes) - primshp += primshp + tuple(shape) - else: - sortorder = level[0] - - indexer = indexer_from_factorized(primary, primshp, compress=False) - - if not ascending: - indexer = indexer[::-1] - - indexer = ensure_platform_int(indexer) - new_codes = [level_codes.take(indexer) for level_codes in self.codes] - - new_index = MultiIndex( - codes=new_codes, - levels=self.levels, - names=self.names, - sortorder=sortorder, - verify_integrity=False, - ) - - return new_index, indexer - - def _wrap_reindex_result(self, target, indexer, preserve_names: bool): - if not isinstance(target, MultiIndex): - if indexer is None: - target = self - elif (indexer >= 0).all(): - target = self.take(indexer) - else: - try: - target = MultiIndex.from_tuples(target) - except TypeError: - # not all tuples, see test_constructor_dict_multiindex_reindex_flat - return target - - target = self._maybe_preserve_names(target, preserve_names) - return target - - def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index: - if ( - preserve_names - and target.nlevels == self.nlevels - and target.names != self.names - ): - target = target.copy(deep=False) - target.names = self.names - return target - - # -------------------------------------------------------------------- - # Indexing Methods - - def _check_indexing_error(self, key) -> None: - if not is_hashable(key) or is_iterator(key): - # We allow tuples if they are hashable, whereas other Index - # subclasses require scalar. - # We have to explicitly exclude generators, as these are hashable. - raise InvalidIndexError(key) - - @cache_readonly - def _should_fallback_to_positional(self) -> bool: - """ - Should integer key(s) be treated as positional? - """ - # GH#33355 - return self.levels[0]._should_fallback_to_positional - - def _get_values_for_loc(self, series: Series, loc, key): - """ - Do a positional lookup on the given Series, returning either a scalar - or a Series. 
- - Assumes that `series.index is self` - """ - new_values = series._values[loc] - if is_scalar(loc): - return new_values - - if len(new_values) == 1 and not self.nlevels > 1: - # If more than one level left, we can not return a scalar - return new_values[0] - - new_index = self[loc] - new_index = maybe_droplevels(new_index, key) - new_ser = series._constructor(new_values, index=new_index, name=series.name) - return new_ser.__finalize__(series) - - def _get_indexer_strict( - self, key, axis_name: str - ) -> tuple[Index, npt.NDArray[np.intp]]: - - keyarr = key - if not isinstance(keyarr, Index): - keyarr = com.asarray_tuplesafe(keyarr) - - if len(keyarr) and not isinstance(keyarr[0], tuple): - indexer = self._get_indexer_level_0(keyarr) - - self._raise_if_missing(key, indexer, axis_name) - return self[indexer], indexer - - return super()._get_indexer_strict(key, axis_name) - - def _raise_if_missing(self, key, indexer, axis_name: str) -> None: - keyarr = key - if not isinstance(key, Index): - keyarr = com.asarray_tuplesafe(key) - - if len(keyarr) and not isinstance(keyarr[0], tuple): - # i.e. same condition for special case in MultiIndex._get_indexer_strict - - mask = indexer == -1 - if mask.any(): - check = self.levels[0].get_indexer(keyarr) - cmask = check == -1 - if cmask.any(): - raise KeyError(f"{keyarr[cmask]} not in index") - # We get here when levels still contain values which are not - # actually in Index anymore - raise KeyError(f"{keyarr} not in index") - else: - return super()._raise_if_missing(key, indexer, axis_name) - - def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]: - """ - Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`. - """ - lev = self.levels[0] - codes = self._codes[0] - cat = Categorical.from_codes(codes=codes, categories=lev) - ci = Index(cat) - return ci.get_indexer_for(target) - - def get_slice_bound( - self, label: Hashable | Sequence[Hashable], side: str, kind=lib.no_default - ) -> int: - """ - For an ordered MultiIndex, compute slice bound - that corresponds to given label. - - Returns leftmost (one-past-the-rightmost if `side=='right') position - of given label. - - Parameters - ---------- - label : object or tuple of objects - side : {'left', 'right'} - kind : {'loc', 'getitem', None} - - .. deprecated:: 1.4.0 - - Returns - ------- - int - Index of label. - - Notes - ----- - This method only works if level 0 index of the MultiIndex is lexsorted. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) - - Get the locations from the leftmost 'b' in the first level - until the end of the multiindex: - - >>> mi.get_slice_bound('b', side="left") - 1 - - Like above, but if you get the locations from the rightmost - 'b' in the first level and 'f' in the second level: - - >>> mi.get_slice_bound(('b','f'), side="right") - 3 - - See Also - -------- - MultiIndex.get_loc : Get location for a label or a tuple of labels. - MultiIndex.get_locs : Get location for a label/slice/list/mask or a - sequence of such. - """ - self._deprecated_arg(kind, "kind", "get_slice_bound") - - if not isinstance(label, tuple): - label = (label,) - return self._partial_tup_index(label, side=side) - - def slice_locs( - self, start=None, end=None, step=None, kind=lib.no_default - ) -> tuple[int, int]: - """ - For an ordered MultiIndex, compute the slice locations for input - labels. - - The input labels can be tuples representing partial levels, e.g. 
for a - MultiIndex with 3 levels, you can pass a single value (corresponding to - the first level), or a 1-, 2-, or 3-tuple. - - Parameters - ---------- - start : label or tuple, default None - If None, defaults to the beginning - end : label or tuple - If None, defaults to the end - step : int or None - Slice step - kind : string, optional, defaults None - - .. deprecated:: 1.4.0 - - Returns - ------- - (start, end) : (int, int) - - Notes - ----- - This method only works if the MultiIndex is properly lexsorted. So, - if only the first 2 levels of a 3-level MultiIndex are lexsorted, - you can only pass two levels to ``.slice_locs``. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')], - ... names=['A', 'B']) - - Get the slice locations from the beginning of 'b' in the first level - until the end of the multiindex: - - >>> mi.slice_locs(start='b') - (1, 4) - - Like above, but stop at the end of 'b' in the first level and 'f' in - the second level: - - >>> mi.slice_locs(start='b', end=('b', 'f')) - (1, 3) - - See Also - -------- - MultiIndex.get_loc : Get location for a label or a tuple of labels. - MultiIndex.get_locs : Get location for a label/slice/list/mask or a - sequence of such. - """ - self._deprecated_arg(kind, "kind", "slice_locs") - # This function adds nothing to its parent implementation (the magic - # happens in get_slice_bound method), but it adds meaningful doc. - return super().slice_locs(start, end, step) - - def _partial_tup_index(self, tup: tuple, side="left"): - if len(tup) > self._lexsort_depth: - raise UnsortedIndexError( - f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth " - f"({self._lexsort_depth})" - ) - - n = len(tup) - start, end = 0, len(self) - zipped = zip(tup, self.levels, self.codes) - for k, (lab, lev, level_codes) in enumerate(zipped): - section = level_codes[start:end] - - if lab not in lev and not isna(lab): - # short circuit - try: - loc = algos.searchsorted(lev, lab, side=side) - except TypeError as err: - # non-comparable e.g. test_slice_locs_with_type_mismatch - raise TypeError(f"Level type mismatch: {lab}") from err - if not is_integer(loc): - # non-comparable level, e.g. test_groupby_example - raise TypeError(f"Level type mismatch: {lab}") - if side == "right" and loc >= 0: - loc -= 1 - return start + algos.searchsorted(section, loc, side=side) - - idx = self._get_loc_single_level_index(lev, lab) - if isinstance(idx, slice) and k < n - 1: - # Get start and end value from slice, necessary when a non-integer - # interval is given as input GH#37707 - start = idx.start - end = idx.stop - elif k < n - 1: - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, dtype[signedinteger[Any]]] - end = start + algos.searchsorted( # type: ignore[assignment] - section, idx, side="right" - ) - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, dtype[signedinteger[Any]]] - start = start + algos.searchsorted( # type: ignore[assignment] - section, idx, side="left" - ) - elif isinstance(idx, slice): - idx = idx.start - return start + algos.searchsorted(section, idx, side=side) - else: - return start + algos.searchsorted(section, idx, side=side) - - def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: - """ - If key is NA value, location of index unify as -1. - - Parameters - ---------- - level_index: Index - key : label - - Returns - ------- - loc : int - If key is NA value, loc is -1 - Else, location of key in index. 
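The NA short-circuit described above amounts to the following (a sketch; `loc_or_na_sentinel` is a hypothetical stand-in for the method):

    import pandas as pd

    def loc_or_na_sentinel(level_index, key):
        # NA keys unify to the -1 code sentinel; others defer to Index.get_loc
        if pd.api.types.is_scalar(key) and pd.isna(key):
            return -1
        return level_index.get_loc(key)

    lev = pd.Index(["a", "b"])
    loc_or_na_sentinel(lev, float("nan"))   # -1
    loc_or_na_sentinel(lev, "b")            # 1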
- - See Also - -------- - Index.get_loc : The get_loc method for (single-level) index. - """ - if is_scalar(key) and isna(key): - return -1 - else: - return level_index.get_loc(key) - - def get_loc(self, key, method=None): - """ - Get location for a label or a tuple of labels. - - The location is returned as an integer/slice or boolean - mask. - - Parameters - ---------- - key : label or tuple of labels (one for each level) - method : None - - Returns - ------- - loc : int, slice object or boolean mask - If the key is past the lexsort depth, the return may be a - boolean mask array, otherwise it is always a slice or int. - - See Also - -------- - Index.get_loc : The get_loc method for (single-level) index. - MultiIndex.slice_locs : Get slice location given start label(s) and - end label(s). - MultiIndex.get_locs : Get location for a label/slice/list/mask or a - sequence of such. - - Notes - ----- - The key cannot be a slice, list of same-level labels, a boolean mask, - or a sequence of such. If you want to use those, use - :meth:`MultiIndex.get_locs` instead. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) - - >>> mi.get_loc('b') - slice(1, 3, None) - - >>> mi.get_loc(('b', 'e')) - 1 - """ - if method is not None: - raise NotImplementedError( - "only the default get_loc method is " - "currently supported for MultiIndex" - ) - - self._check_indexing_error(key) - - def _maybe_to_slice(loc): - """convert integer indexer to boolean mask or slice if possible""" - if not isinstance(loc, np.ndarray) or loc.dtype != np.intp: - return loc - - loc = lib.maybe_indices_to_slice(loc, len(self)) - if isinstance(loc, slice): - return loc - - mask = np.empty(len(self), dtype="bool") - mask.fill(False) - mask[loc] = True - return mask - - if not isinstance(key, tuple): - loc = self._get_level_indexer(key, level=0) - return _maybe_to_slice(loc) - - keylen = len(key) - if self.nlevels < keylen: - raise KeyError( - f"Key length ({keylen}) exceeds index depth ({self.nlevels})" - ) - - if keylen == self.nlevels and self.is_unique: - try: - return self._engine.get_loc(key) - except TypeError: - # e.g. test_partial_slicing_with_multiindex partial string slicing - loc, _ = self.get_loc_level(key, list(range(self.nlevels))) - return loc - - # -- partial selection or non-unique index - # break the key into 2 parts based on the lexsort_depth of the index; - # the first part returns a continuous slice of the index; the 2nd part - # needs linear search within the slice - i = self._lexsort_depth - lead_key, follow_key = key[:i], key[i:] - - if not lead_key: - start = 0 - stop = len(self) - else: - try: - start, stop = self.slice_locs(lead_key, lead_key) - except TypeError as err: - # e.g. 
test_groupby_example key = ((0, 0, 1, 2), "new_col") - # when self has 5 integer levels - raise KeyError(key) from err - - if start == stop: - raise KeyError(key) - - if not follow_key: - return slice(start, stop) - - warnings.warn( - "indexing past lexsort depth may impact performance.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) - - loc = np.arange(start, stop, dtype=np.intp) - - for i, k in enumerate(follow_key, len(lead_key)): - mask = self.codes[i][loc] == self._get_loc_single_level_index( - self.levels[i], k - ) - if not mask.all(): - loc = loc[mask] - if not len(loc): - raise KeyError(key) - - return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop) - - def get_loc_level(self, key, level=0, drop_level: bool = True): - """ - Get location and sliced index for requested label(s)/level(s). - - Parameters - ---------- - key : label or sequence of labels - level : int/level name or list thereof, optional - drop_level : bool, default True - If ``False``, the resulting index will not drop any level. - - Returns - ------- - loc : A 2-tuple where the elements are: - Element 0: int, slice object or boolean array - Element 1: The resulting sliced multiindex/index. If the key - contains all levels, this will be ``None``. - - See Also - -------- - MultiIndex.get_loc : Get location for a label or a tuple of labels. - MultiIndex.get_locs : Get location for a label/slice/list/mask or a - sequence of such. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], - ... names=['A', 'B']) - - >>> mi.get_loc_level('b') - (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) - - >>> mi.get_loc_level('e', level='B') - (array([False, True, False]), Index(['b'], dtype='object', name='A')) - - >>> mi.get_loc_level(['b', 'e']) - (1, None) - """ - if not isinstance(level, (list, tuple)): - level = self._get_level_number(level) - else: - level = [self._get_level_number(lev) for lev in level] - - loc, mi = self._get_loc_level(key, level=level) - if not drop_level: - if lib.is_integer(loc): - mi = self[loc : loc + 1] - else: - mi = self[loc] - return loc, mi - - def _get_loc_level(self, key, level: int | list[int] = 0): - """ - get_loc_level but with `level` known to be positional, not name-based. - """ - - # different name to distinguish from maybe_droplevels - def maybe_mi_droplevels(indexer, levels): - """ - If level does not exist or all levels were dropped, the exception - has to be handled outside. - """ - new_index = self[indexer] - - for i in sorted(levels, reverse=True): - new_index = new_index._drop_level_numbers([i]) - - return new_index - - if isinstance(level, (tuple, list)): - if len(key) != len(level): - raise AssertionError( - "Key for location must have same length as number of levels" - ) - result = None - for lev, k in zip(level, key): - loc, new_index = self._get_loc_level(k, level=lev) - if isinstance(loc, slice): - mask = np.zeros(len(self), dtype=bool) - mask[loc] = True - loc = mask - result = loc if result is None else result & loc - - try: - # FIXME: we should be only dropping levels on which we are - # scalar-indexing - mi = maybe_mi_droplevels(result, level) - except ValueError: - # droplevel failed because we tried to drop all levels, - # i.e. 
len(level) == self.nlevels - mi = self[result] - - return result, mi - - # kludge for #1796 - if isinstance(key, list): - key = tuple(key) - - if isinstance(key, tuple) and level == 0: - - try: - # Check if this tuple is a single key in our first level - if key in self.levels[0]: - indexer = self._get_level_indexer(key, level=level) - new_index = maybe_mi_droplevels(indexer, [0]) - return indexer, new_index - except (TypeError, InvalidIndexError): - pass - - if not any(isinstance(k, slice) for k in key): - - if len(key) == self.nlevels and self.is_unique: - # Complete key in unique index -> standard get_loc - try: - return (self._engine.get_loc(key), None) - except KeyError as err: - raise KeyError(key) from err - except TypeError: - # e.g. partial string indexing - # test_partial_string_timestamp_multiindex - pass - - # partial selection - indexer = self.get_loc(key) - ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] - if len(ilevels) == self.nlevels: - if is_integer(indexer): - # we are dropping all levels - return indexer, None - - # TODO: in some cases we still need to drop some levels, - # e.g. test_multiindex_perf_warn - # test_partial_string_timestamp_multiindex - ilevels = [ - i - for i in range(len(key)) - if ( - not isinstance(key[i], str) - or not self.levels[i]._supports_partial_string_indexing - ) - and key[i] != slice(None, None) - ] - if len(ilevels) == self.nlevels: - # TODO: why? - ilevels = [] - return indexer, maybe_mi_droplevels(indexer, ilevels) - - else: - indexer = None - for i, k in enumerate(key): - if not isinstance(k, slice): - loc_level = self._get_level_indexer(k, level=i) - if isinstance(loc_level, slice): - if com.is_null_slice(loc_level) or com.is_full_slice( - loc_level, len(self) - ): - # everything - continue - else: - # e.g. test_xs_IndexSlice_argument_not_implemented - k_index = np.zeros(len(self), dtype=bool) - k_index[loc_level] = True - - else: - k_index = loc_level - - elif com.is_null_slice(k): - # taking everything, does not affect `indexer` below - continue - - else: - # FIXME: this message can be inaccurate, e.g. - # test_series_varied_multiindex_alignment - raise TypeError(f"Expected label or tuple of labels, got {key}") - - if indexer is None: - indexer = k_index - else: - indexer &= k_index - if indexer is None: - indexer = slice(None, None) - ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] - return indexer, maybe_mi_droplevels(indexer, ilevels) - else: - indexer = self._get_level_indexer(key, level=level) - if ( - isinstance(key, str) - and self.levels[level]._supports_partial_string_indexing - ): - # check to see if we did an exact lookup vs sliced - check = self.levels[level].get_loc(key) - if not is_integer(check): - # e.g. 
test_partial_string_timestamp_multiindex - return indexer, self[indexer] - - try: - result_index = maybe_mi_droplevels(indexer, [level]) - except ValueError: - result_index = self[indexer] - - return indexer, result_index - - def _get_level_indexer( - self, key, level: int = 0, indexer: Int64Index | None = None - ): - # `level` kwarg is _always_ positional, never name - # return an indexer, boolean array or a slice showing where the key is - # in the totality of values - # if the indexer is provided, then use this - - level_index = self.levels[level] - level_codes = self.codes[level] - - def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): - # given the inputs and the codes/indexer, compute an indexer set - # if we have a provided indexer, then this need not consider - # the entire labels set - r = np.arange(start, stop, step) - - if indexer is not None and len(indexer) != len(codes): - - # we have an indexer which maps the locations in the labels - # that we have already selected (and is not an indexer for the - # entire set) otherwise this is wasteful so we only need to - # examine locations that are in this set the only magic here is - # that the result are the mappings to the set that we have - # selected - from pandas import Series - - mapper = Series(indexer) - indexer = codes.take(ensure_platform_int(indexer)) - result = Series(Index(indexer).isin(r).nonzero()[0]) - m = result.map(mapper) - # error: Incompatible types in assignment (expression has type - # "ndarray", variable has type "Series") - m = np.asarray(m) # type: ignore[assignment] - - else: - # error: Incompatible types in assignment (expression has type - # "ndarray", variable has type "Series") - m = np.zeros(len(codes), dtype=bool) # type: ignore[assignment] - m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True - - return m - - if isinstance(key, slice): - # handle a slice, returning a slice if we can - # otherwise a boolean indexer - step = key.step - is_negative_step = step is not None and step < 0 - - try: - if key.start is not None: - start = level_index.get_loc(key.start) - elif is_negative_step: - start = len(level_index) - 1 - else: - start = 0 - - if key.stop is not None: - stop = level_index.get_loc(key.stop) - elif is_negative_step: - stop = 0 - elif isinstance(start, slice): - stop = len(level_index) - else: - stop = len(level_index) - 1 - except KeyError: - - # we have a partial slice (like looking up a partial date - # string) - start = stop = level_index.slice_indexer(key.start, key.stop, key.step) - step = start.step - - if isinstance(start, slice) or isinstance(stop, slice): - # we have a slice for start and/or stop - # a partial date slicer on a DatetimeIndex generates a slice - # note that the stop ALREADY includes the stopped point (if - # it was a string sliced) - start = getattr(start, "start", start) - stop = getattr(stop, "stop", stop) - return convert_indexer(start, stop, step) - - elif level > 0 or self._lexsort_depth == 0 or step is not None: - # need to have like semantics here to right - # searching as when we are using a slice - # so adjust the stop by 1 (so we include stop) - stop = (stop - 1) if is_negative_step else (stop + 1) - return convert_indexer(start, stop, step) - else: - # sorted, so can return slice object -> view - i = algos.searchsorted(level_codes, start, side="left") - j = algos.searchsorted(level_codes, stop, side="right") - return slice(i, j, step) - - else: - - idx = self._get_loc_single_level_index(level_index, key) - - if level > 0 
or self._lexsort_depth == 0: - # Desired level is not sorted - if isinstance(idx, slice): - # test_get_loc_partial_timestamp_multiindex - locs = (level_codes >= idx.start) & (level_codes < idx.stop) - return locs - - locs = np.array(level_codes == idx, dtype=bool, copy=False) - - if not locs.any(): - # The label is present in self.levels[level] but unused: - raise KeyError(key) - return locs - - if isinstance(idx, slice): - # e.g. test_partial_string_timestamp_multiindex - start = algos.searchsorted(level_codes, idx.start, side="left") - # NB: "left" here bc of slice semantics - end = algos.searchsorted(level_codes, idx.stop, side="left") - else: - start = algos.searchsorted(level_codes, idx, side="left") - end = algos.searchsorted(level_codes, idx, side="right") - - if start == end: - # The label is present in self.levels[level] but unused: - raise KeyError(key) - return slice(start, end) - - def get_locs(self, seq): - """ - Get location for a sequence of labels. - - Parameters - ---------- - seq : label, slice, list, mask or a sequence of such - You should use one of the above for each level. - If a level should not be used, set it to ``slice(None)``. - - Returns - ------- - numpy.ndarray - NumPy array of integers suitable for passing to iloc. - - See Also - -------- - MultiIndex.get_loc : Get location for a label or a tuple of labels. - MultiIndex.slice_locs : Get slice location given start label(s) and - end label(s). - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) - - >>> mi.get_locs('b') # doctest: +SKIP - array([1, 2], dtype=int64) - - >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP - array([1, 2], dtype=int64) - - >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP - array([2], dtype=int64) - """ - - # must be lexsorted to at least as many levels - true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] - if true_slices and true_slices[-1] >= self._lexsort_depth: - raise UnsortedIndexError( - "MultiIndex slicing requires the index to be lexsorted: slicing " - f"on levels {true_slices}, lexsort depth {self._lexsort_depth}" - ) - - n = len(self) - # indexer is the list of all positions that we want to take; it - # is created on the first entry in seq and narrowed down as we - # look at remaining entries - indexer = None - - if any(x is Ellipsis for x in seq): - raise NotImplementedError( - "MultiIndex does not support indexing with Ellipsis" - ) - - def _convert_to_indexer(r) -> Int64Index: - # return an indexer - if isinstance(r, slice): - m = np.zeros(n, dtype=bool) - m[r] = True - r = m.nonzero()[0] - elif com.is_bool_indexer(r): - if len(r) != n: - raise ValueError( - "cannot index with a boolean indexer " - "that is not the same length as the " - "index" - ) - r = r.nonzero()[0] - return Int64Index(r) - - def _update_indexer(idxr: Index, indexer: Index | None) -> Index: - if indexer is None: - return idxr - indexer_intersection = indexer.intersection(idxr) - if indexer_intersection.empty and not idxr.empty and not indexer.empty: - raise KeyError(seq) - return indexer_intersection - - for i, k in enumerate(seq): - - if com.is_bool_indexer(k): - # a boolean indexer, must be the same length! 
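
A doctest-style sketch of this branch (hypothetical `mi`; the boolean mask must match the index length, mirroring the ValueError text in `_convert_to_indexer` above):

>>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
>>> mi.get_locs([[True, False, True], slice(None)])  # doctest: +SKIP
array([0, 2], dtype=int64)
>>> mi.get_locs([[True, False], slice(None)])  # doctest: +SKIP
Traceback (most recent call last):
  ...
ValueError: cannot index with a boolean indexer that is not the same length as the index
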
- k = np.asarray(k) - lvl_indexer = _convert_to_indexer(k) - indexer = _update_indexer(lvl_indexer, indexer=indexer) - - elif is_list_like(k): - # a collection of labels to include from this level (these - # are or'd) - - indexers: Int64Index | None = None - - # GH#27591 check if this is a single tuple key in the level - try: - # Argument "indexer" to "_get_level_indexer" of "MultiIndex" - # has incompatible type "Index"; expected "Optional[Int64Index]" - lev_loc = self._get_level_indexer( - k, level=i, indexer=indexer # type: ignore[arg-type] - ) - except (InvalidIndexError, TypeError, KeyError) as err: - # InvalidIndexError e.g. non-hashable, fall back to treating - # this as a sequence of labels - # KeyError it can be ambiguous if this is a label or sequence - # of labels - # github.com/pandas-dev/pandas/issues/39424#issuecomment-871626708 - for x in k: - if not is_hashable(x): - # e.g. slice - raise err - try: - # Argument "indexer" to "_get_level_indexer" of "MultiIndex" - # has incompatible type "Index"; expected - # "Optional[Int64Index]" - item_lvl_indexer = self._get_level_indexer( - x, level=i, indexer=indexer # type: ignore[arg-type] - ) - except KeyError: - # ignore not founds; see discussion in GH#39424 - warnings.warn( - "The behavior of indexing on a MultiIndex with a " - "nested sequence of labels is deprecated and will " - "change in a future version. " - "`series.loc[label, sequence]` will raise if any " - "members of 'sequence' or not present in " - "the index's second level. To retain the old " - "behavior, use `series.index.isin(sequence, level=1)`", - # TODO: how to opt in to the future behavior? - # TODO: how to handle IntervalIndex level? - # (no test cases) - FutureWarning, - stacklevel=find_stack_level(), - ) - continue - else: - idxrs = _convert_to_indexer(item_lvl_indexer) - - if indexers is None: - indexers = idxrs - else: - indexers = indexers.union(idxrs, sort=False) - - else: - idxrs = _convert_to_indexer(lev_loc) - if indexers is None: - indexers = idxrs - else: - indexers = indexers.union(idxrs, sort=False) - - if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer) - else: - # no matches we are done - # test_loc_getitem_duplicates_multiindex_empty_indexer - return np.array([], dtype=np.intp) - - elif com.is_null_slice(k): - # empty slice - if indexer is None: - indexer = Index(np.arange(n)) - - elif isinstance(k, slice): - - # a slice, include BOTH of the labels - # Argument "indexer" to "_get_level_indexer" of "MultiIndex" has - # incompatible type "Index"; expected "Optional[Int64Index]" - lvl_indexer = self._get_level_indexer( - k, - level=i, - indexer=indexer, # type: ignore[arg-type] - ) - indexer = _update_indexer( - _convert_to_indexer(lvl_indexer), - indexer=indexer, - ) - else: - # a single label - lvl_indexer = self._get_loc_level(k, level=i)[0] - indexer = _update_indexer( - _convert_to_indexer(lvl_indexer), - indexer=indexer, - ) - - # empty indexer - if indexer is None: - return np.array([], dtype=np.intp) - - assert isinstance(indexer, Int64Index), type(indexer) - indexer = self._reorder_indexer(seq, indexer) - - return indexer._values.astype(np.intp, copy=False) - - # -------------------------------------------------------------------- - - def _reorder_indexer( - self, - seq: tuple[Scalar | Iterable | AnyArrayLike, ...], - indexer: Int64Index, - ) -> Int64Index: - """ - Reorder an indexer of a MultiIndex (self) so that the label are in the - same order as given in seq - - Parameters - ---------- - seq : 
label/slice/list/mask or a sequence of such - indexer: an Int64Index indexer of self - - Returns - ------- - indexer : a sorted Int64Index indexer of self ordered as seq - """ - # If the index is lexsorted and the list_like label in seq are sorted - # then we do not need to sort - if self._is_lexsorted(): - need_sort = False - for i, k in enumerate(seq): - if is_list_like(k): - if not need_sort: - k_codes = self.levels[i].get_indexer(k) - k_codes = k_codes[k_codes >= 0] # Filter absent keys - # True if the given codes are not ordered - need_sort = (k_codes[:-1] > k_codes[1:]).any() - elif isinstance(k, slice) and k.step is not None and k.step < 0: - need_sort = True - # Bail out if both index and seq are sorted - if not need_sort: - return indexer - - n = len(self) - keys: tuple[np.ndarray, ...] = () - # For each level of the sequence in seq, map the level codes with the - # order they appears in a list-like sequence - # This mapping is then use to reorder the indexer - for i, k in enumerate(seq): - if is_scalar(k): - # GH#34603 we want to treat a scalar the same as an all equal list - k = [k] - if com.is_bool_indexer(k): - new_order = np.arange(n)[indexer] - elif is_list_like(k): - # Generate a map with all level codes as sorted initially - k = algos.unique(k) - key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( - self.levels[i] - ) - # Set order as given in the indexer list - level_indexer = self.levels[i].get_indexer(k) - level_indexer = level_indexer[level_indexer >= 0] # Filter absent keys - key_order_map[level_indexer] = np.arange(len(level_indexer)) - - new_order = key_order_map[self.codes[i][indexer]] - elif isinstance(k, slice) and k.step is not None and k.step < 0: - # flip order for negative step - new_order = np.arange(n)[::-1][indexer] - elif isinstance(k, slice) and k.start is None and k.stop is None: - # slice(None) should not determine order GH#31330 - new_order = np.ones((n,))[indexer] - else: - # For all other case, use the same order as the level - new_order = np.arange(n)[indexer] - keys = (new_order,) + keys - - # Find the reordering using lexsort on the keys mapping - ind = np.lexsort(keys) - return indexer[ind] - - def truncate(self, before=None, after=None) -> MultiIndex: - """ - Slice index between two labels / tuples, return new MultiIndex - - Parameters - ---------- - before : label or tuple, can be partial. Default None - None defaults to start - after : label or tuple, can be partial. 
Default None - None defaults to end - - Returns - ------- - truncated : MultiIndex - """ - if after and before and after < before: - raise ValueError("after < before") - - i, j = self.levels[0].slice_locs(before, after) - left, right = self.slice_locs(before, after) - - new_levels = list(self.levels) - new_levels[0] = new_levels[0][i:j] - - new_codes = [level_codes[left:right] for level_codes in self.codes] - new_codes[0] = new_codes[0] - i - - return MultiIndex( - levels=new_levels, - codes=new_codes, - names=self._names, - verify_integrity=False, - ) - - def equals(self, other: object) -> bool: - """ - Determines if two MultiIndex objects have the same labeling information - (the levels themselves do not necessarily have to be the same) - - See Also - -------- - equal_levels - """ - if self.is_(other): - return True - - if not isinstance(other, Index): - return False - - if len(self) != len(other): - return False - - if not isinstance(other, MultiIndex): - # d-level MultiIndex can equal d-tuple Index - if not self._should_compare(other): - # object Index or Categorical[object] may contain tuples - return False - return array_equivalent(self._values, other._values) - - if self.nlevels != other.nlevels: - return False - - for i in range(self.nlevels): - self_codes = self.codes[i] - other_codes = other.codes[i] - self_mask = self_codes == -1 - other_mask = other_codes == -1 - if not np.array_equal(self_mask, other_mask): - return False - self_codes = self_codes[~self_mask] - self_values = self.levels[i]._values.take(self_codes) - - other_codes = other_codes[~other_mask] - other_values = other.levels[i]._values.take(other_codes) - - # since we use NaT both datetime64 and timedelta64 we can have a - # situation where a level is typed say timedelta64 in self (IOW it - # has other values than NaT) but types datetime64 in other (where - # its all NaT) but these are equivalent - if len(self_values) == 0 and len(other_values) == 0: - continue - - if not isinstance(self_values, np.ndarray): - # i.e. ExtensionArray - if not self_values.equals(other_values): - return False - elif not isinstance(other_values, np.ndarray): - # i.e. 
other is ExtensionArray - if not other_values.equals(self_values): - return False - else: - if not array_equivalent(self_values, other_values): - return False - - return True - - def equal_levels(self, other: MultiIndex) -> bool: - """ - Return True if the levels of both MultiIndex objects are the same - - """ - if self.nlevels != other.nlevels: - return False - - for i in range(self.nlevels): - if not self.levels[i].equals(other.levels[i]): - return False - return True - - # -------------------------------------------------------------------- - # Set Methods - - def _union(self, other, sort) -> MultiIndex: - other, result_names = self._convert_can_do_setop(other) - if ( - any(-1 in code for code in self.codes) - and any(-1 in code for code in other.codes) - or self.has_duplicates - or other.has_duplicates - ): - # This is only necessary if both sides have nans or one has dups, - # fast_unique_multiple is faster - result = super()._union(other, sort) - else: - rvals = other._values.astype(object, copy=False) - result = lib.fast_unique_multiple([self._values, rvals], sort=sort) - - return MultiIndex.from_arrays(zip(*result), sortorder=None, names=result_names) - - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - return is_object_dtype(dtype) - - def _get_reconciled_name_object(self, other) -> MultiIndex: - """ - If the result of a set operation will be self, - return self, unless the names change, in which - case make a shallow copy of self. - """ - names = self._maybe_match_names(other) - if self.names != names: - # Incompatible return value type (got "Optional[MultiIndex]", expected - # "MultiIndex") - return self.rename(names) # type: ignore[return-value] - return self - - def _maybe_match_names(self, other): - """ - Try to find common names to attach to the result of an operation between - a and b. Return a consensus list of names if they match at least partly - or list of None if they have completely different names. - """ - if len(self.names) != len(other.names): - return [None] * len(self.names) - names = [] - for a_name, b_name in zip(self.names, other.names): - if a_name == b_name: - names.append(a_name) - else: - # TODO: what if they both have np.nan for their names? 
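
A short sketch of the consensus rule (hypothetical indexes; the names agree only on the first level, so the second becomes None):

>>> a = pd.MultiIndex.from_arrays([[1], [2]], names=['x', 'y'])
>>> b = pd.MultiIndex.from_arrays([[3], [4]], names=['x', 'z'])
>>> a.union(b).names  # doctest: +SKIP
FrozenList(['x', None])
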
- names.append(None) - return names - - def _wrap_intersection_result(self, other, result) -> MultiIndex: - _, result_names = self._convert_can_do_setop(other) - - if len(result) == 0: - return MultiIndex( - levels=self.levels, - codes=[[]] * self.nlevels, - names=result_names, - verify_integrity=False, - ) - else: - return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) - - def _wrap_difference_result(self, other, result) -> MultiIndex: - _, result_names = self._convert_can_do_setop(other) - - if len(result) == 0: - return MultiIndex( - levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - names=result_names, - verify_integrity=False, - ) - else: - return MultiIndex.from_tuples(result, sortorder=0, names=result_names) - - def _convert_can_do_setop(self, other): - result_names = self.names - - if not isinstance(other, Index): - - if len(other) == 0: - return self[:0], self.names - else: - msg = "other must be a MultiIndex or a list of tuples" - try: - other = MultiIndex.from_tuples(other, names=self.names) - except (ValueError, TypeError) as err: - # ValueError raised by tuples_to_object_array if we - # have non-object dtype - raise TypeError(msg) from err - else: - result_names = get_unanimous_names(self, other) - - return other, result_names - - # -------------------------------------------------------------------- - - @doc(Index.astype) - def astype(self, dtype, copy: bool = True): - dtype = pandas_dtype(dtype) - if is_categorical_dtype(dtype): - msg = "> 1 ndim Categorical are not supported at this time" - raise NotImplementedError(msg) - elif not is_object_dtype(dtype): - raise TypeError( - "Setting a MultiIndex dtype to anything other than object " - "is not supported" - ) - elif copy is True: - return self._view() - return self - - def _validate_fill_value(self, item): - if isinstance(item, MultiIndex): - # GH#43212 - if item.nlevels != self.nlevels: - raise ValueError("Item must have length equal to number of levels.") - return item._values - elif not isinstance(item, tuple): - # Pad the key with empty strings if lower levels of the key - # aren't specified: - item = (item,) + ("",) * (self.nlevels - 1) - elif len(item) != self.nlevels: - raise ValueError("Item must have length equal to number of levels.") - return item - - def insert(self, loc: int, item) -> MultiIndex: - """ - Make new MultiIndex inserting new item at location - - Parameters - ---------- - loc : int - item : tuple - Must be same length as number of levels in the MultiIndex - - Returns - ------- - new_index : Index - """ - item = self._validate_fill_value(item) - - new_levels = [] - new_codes = [] - for k, level, level_codes in zip(item, self.levels, self.codes): - if k not in level: - # have to insert into level - # must insert at end otherwise you have to recompute all the - # other codes - lev_loc = len(level) - level = level.insert(lev_loc, k) - else: - lev_loc = level.get_loc(k) - - new_levels.append(level) - new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc)) - - return MultiIndex( - levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False - ) - - def delete(self, loc) -> MultiIndex: - """ - Make new index with passed location deleted - - Returns - ------- - new_index : MultiIndex - """ - new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] - return MultiIndex( - levels=self.levels, - codes=new_codes, - names=self.names, - verify_integrity=False, - ) - - @doc(Index.isin) - def isin(self, values, level=None) -> 
npt.NDArray[np.bool_]: - if level is None: - values = MultiIndex.from_tuples(values, names=self.names)._values - return algos.isin(self._values, values) + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._data) else: - num = self._get_level_number(level) - levs = self.get_level_values(num) - - if levs.size == 0: - return np.zeros(len(levs), dtype=np.bool_) - return levs.isin(values) - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "names"]) - def set_names(self, names, level=None, inplace: bool = False) -> MultiIndex | None: - return super().set_names(names=names, level=level, inplace=inplace) - - rename = set_names - - @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) - def drop_duplicates(self, keep: str | bool = "first") -> MultiIndex: - return super().drop_duplicates(keep=keep) - - # --------------------------------------------------------------- - # Arithmetic/Numeric Methods - Disabled - - __add__ = make_invalid_op("__add__") - __radd__ = make_invalid_op("__radd__") - __iadd__ = make_invalid_op("__iadd__") - __sub__ = make_invalid_op("__sub__") - __rsub__ = make_invalid_op("__rsub__") - __isub__ = make_invalid_op("__isub__") - __pow__ = make_invalid_op("__pow__") - __rpow__ = make_invalid_op("__rpow__") - __mul__ = make_invalid_op("__mul__") - __rmul__ = make_invalid_op("__rmul__") - __floordiv__ = make_invalid_op("__floordiv__") - __rfloordiv__ = make_invalid_op("__rfloordiv__") - __truediv__ = make_invalid_op("__truediv__") - __rtruediv__ = make_invalid_op("__rtruediv__") - __mod__ = make_invalid_op("__mod__") - __rmod__ = make_invalid_op("__rmod__") - __divmod__ = make_invalid_op("__divmod__") - __rdivmod__ = make_invalid_op("__rdivmod__") - # Unary methods disabled - __neg__ = make_invalid_op("__neg__") - __pos__ = make_invalid_op("__pos__") - __abs__ = make_invalid_op("__abs__") - __invert__ = make_invalid_op("__invert__") - - -def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: - """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" - int64_codes = [ensure_int64(level_codes) for level_codes in codes] - for k in range(nlevels, 0, -1): - if libalgos.is_lexsorted(int64_codes[:k]): - return k - return 0 - - -def sparsify_labels(label_list, start: int = 0, sentinel=""): - pivoted = list(zip(*label_list)) - k = len(label_list) - - result = pivoted[: start + 1] - prev = pivoted[start] - - for cur in pivoted[start + 1 :]: - sparse_cur = [] - - for i, (p, t) in enumerate(zip(prev, cur)): - if i == k - 1: - sparse_cur.append(t) - result.append(sparse_cur) - break - - if p == t: - sparse_cur.append(sentinel) - else: - sparse_cur.extend(cur[i:]) - result.append(sparse_cur) - break - - prev = cur - - return list(zip(*result)) - - -def _get_na_rep(dtype) -> str: - return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") - - -def maybe_droplevels(index: Index, key) -> Index: - """ - Attempt to drop level or levels from the given index. 
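
A doctest-style sketch of this helper (module-private, so illustrative only; a scalar key drops a single level, a tuple key drops one level per element):

>>> idx = pd.MultiIndex.from_arrays([[1, 1], ['a', 'b']])
>>> maybe_droplevels(idx, 1)  # doctest: +SKIP
Index(['a', 'b'], dtype='object')
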
- - Parameters - ---------- - index: Index - key : scalar or tuple - - Returns - ------- - Index - """ - # drop levels - original_index = index - if isinstance(key, tuple): - for _ in key: - try: - index = index._drop_level_numbers([0]) - except ValueError: - # we have dropped too much, so back out - return original_index - else: - try: - index = index._drop_level_numbers([0]) - except ValueError: - pass - - return index - - -def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: - """ - Coerce the array-like indexer to the smallest integer dtype that can encode all - of the given categories. - - Parameters - ---------- - array_like : array-like - categories : array-like - copy : bool - - Returns - ------- - np.ndarray - Non-writeable. - """ - array_like = coerce_indexer_dtype(array_like, categories) - if copy: - array_like = array_like.copy() - array_like.flags.writeable = False - return array_like - - -def _require_listlike(level, arr, arrname: str): - """ - Ensure that level is either None or listlike, and arr is list-of-listlike. - """ - if level is not None and not is_list_like(level): - if not is_list_like(arr): - raise TypeError(f"{arrname} must be list-like") - if is_list_like(arr[0]): - raise TypeError(f"{arrname} must be list-like") - level = [level] - arr = [arr] - elif level is None or is_list_like(level): - if not is_list_like(arr) or not is_list_like(arr[0]): - raise TypeError(f"{arrname} must be list of lists-like") - return level, arr + result = pc.utf8_rtrim(self._data, characters=to_strip) + return type(self)(result) From 4a329fd2d7876c7088d356adb5c47ce67d01dea4 Mon Sep 17 00:00:00 2001 From: Khor Chean Wei Date: Tue, 15 Mar 2022 22:27:12 +0800 Subject: [PATCH 17/39] Update multi.py --- pandas/core/indexes/multi.py | 4428 +++++++++++++++++++++++++++++----- 1 file changed, 3812 insertions(+), 616 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e39ebd3afd2ff..538baaee1b961 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,806 +1,4002 @@ from __future__ import annotations -from collections.abc import Callable # noqa: PDF001 -import re +from functools import wraps +from sys import getsizeof from typing import ( TYPE_CHECKING, Any, - Union, + Callable, + Collection, + Hashable, + Iterable, + List, + Sequence, + Tuple, cast, - overload, ) +import warnings import numpy as np +from pandas._config import get_option + from pandas._libs import ( + algos as libalgos, + index as libindex, lib, - missing as libmissing, ) +from pandas._libs.hashtable import duplicated from pandas._typing import ( - Dtype, - NpDtype, - PositionalIndexer, + AnyArrayLike, + DtypeObj, + F, Scalar, - ScalarIndexer, - SequenceIndexer, - TakeIndexer, + Shape, npt, ) -from pandas.compat import ( - pa_version_under1p01, - pa_version_under2p0, - pa_version_under3p0, - pa_version_under4p0, +from pandas.compat.numpy import function as nv +from pandas.errors import ( + InvalidIndexError, + PerformanceWarning, + UnsortedIndexError, +) +from pandas.util._decorators import ( + Appender, + cache_readonly, + deprecate_nonkeyword_arguments, + doc, ) -from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_dtype_equal, + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_hashable, is_integer, - is_integer_dtype, + is_iterator, + 
is_list_like, is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.missing import isna - -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays._mixins import ArrowExtensionArray -from pandas.core.arrays.base import ExtensionArray -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import Int64Dtype -from pandas.core.arrays.numeric import NumericDtype -from pandas.core.arrays.string_ import ( - BaseStringArray, - StringDtype, +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDatetimeIndex, + ABCTimedeltaIndex, ) -from pandas.core.indexers import ( - check_array_indexer, - unpack_tuple_and_ellipses, - validate_indices, +from pandas.core.dtypes.missing import ( + array_equivalent, + isna, ) -from pandas.core.strings.object_array import ObjectStringArrayMixin -if not pa_version_under1p01: - import pyarrow as pa - import pyarrow.compute as pc - - ARROW_CMP_FUNCS = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } +import pandas.core.algorithms as algos +from pandas.core.arrays import Categorical +from pandas.core.arrays.categorical import factorize_from_iterables +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + _index_shared_docs, + ensure_index, + get_unanimous_names, +) +from pandas.core.indexes.frozen import FrozenList +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops.invalid import make_invalid_op +from pandas.core.sorting import ( + get_group_index, + indexer_from_factorized, + lexsort_indexer, +) +from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas import Series + from pandas import ( + CategoricalIndex, + DataFrame, + Series, + ) + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"} +) + + +class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """ + This class manages a MultiIndex by mapping label combinations to positive + integers. + """ + + _base = libindex.UInt64Engine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one uint64 (each), in a strictly + monotonic way (i.e. respecting the lexicographic order of integer + combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) -ArrowStringScalarOrNAT = Union[str, libmissing.NAType] + Returns + ------- + scalar or 1-dimensional array, of dtype uint64 + Integer(s) representing one combination (each). + """ + # Shift the representation of each level by the pre-calculated number + # of bits: + codes <<= self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. 
+ # each column in "codes") in a single positive integer: + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """ + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64 bits integers, and uses an ObjectEngine + containing Python integers. + """ + + _base = libindex.ObjectEngine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of + integer combinations): see BaseMultiIndexCodesEngine documentation. + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int, or 1-dimensional array of dtype object + Integer(s) representing one combination (each). + """ + # Shift the representation of each level by the pre-calculated number + # of bits. Since this can overflow uint64, first make sure we are + # working with Python integers: + codes = codes.astype("object") << self.offsets -def _chk_pyarrow_available() -> None: - if pa_version_under1p01: - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." - raise ImportError(msg) + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) -# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from -# ObjectStringArrayMixin because we want to have the object-dtype based methods as -# fallback for the ones that pyarrow doesn't yet support +def names_compat(meth: F) -> F: + """ + A decorator to allow either `name` or `names` keyword but not both. -class ArrowStringArray( - OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin -): + This makes it easier to share code with base class. """ - Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.2.0 + @wraps(meth) + def new_meth(self_or_cls, *args, **kwargs): + if "name" in kwargs and "names" in kwargs: + raise TypeError("Can only provide one of `names` and `name`") + elif "name" in kwargs: + kwargs["names"] = kwargs.pop("name") + + return meth(self_or_cls, *args, **kwargs) - .. warning:: + return cast(F, new_meth) - ArrowStringArray is considered experimental. The implementation and - parts of the API may change without warning. + +class MultiIndex(Index): + """ + A multi-level, or hierarchical, index object for pandas objects. Parameters ---------- - values : pyarrow.Array or pyarrow.ChunkedArray - The array of data. + levels : sequence of arrays + The unique labels for each level. + codes : sequence of arrays + Integers for each level designating which label at each location. + sortorder : optional int + Level of sortedness (must be lexicographically sorted by that + level). + names : optional sequence of objects + Names for each of the index levels. (name is accepted for compat). + copy : bool, default False + Copy the meta-data. + verify_integrity : bool, default True + Check that the levels/codes are consistent and valid. 
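
For illustration, a direct construction from ``levels``/``codes`` (the checks that ``verify_integrity`` performs are described further below; repr alignment may differ slightly):

>>> pd.MultiIndex(levels=[[1, 2], ['red', 'blue']],
...               codes=[[0, 0, 1, 1], [1, 0, 1, 0]])  # doctest: +SKIP
MultiIndex([(1, 'blue'),
            (1,  'red'),
            (2, 'blue'),
            (2,  'red')],
           )
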
Attributes ---------- - None + names + levels + codes + nlevels + levshape Methods ------- - None + from_arrays + from_tuples + from_product + from_frame + set_levels + set_codes + to_frame + to_flat_index + sortlevel + droplevel + swaplevel + reorder_levels + remove_unused_levels + get_locs See Also -------- - array - The recommended function for creating a ArrowStringArray. - Series.str - The string methods are available on Series backed by - a ArrowStringArray. + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Create a MultiIndex from the cartesian product + of iterables. + MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + Index : The base pandas Index type. Notes ----- - ArrowStringArray returns a BooleanArray for comparison methods. + See the `user guide + `__ + for more. Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") - - ['This is', 'some text', , 'data.'] - Length: 4, dtype: string + A new ``MultiIndex`` is typically constructed using one of the helper + methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` + and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): + + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + See further examples for how to construct a MultiIndex in the doc strings + of the mentioned helper methods. """ - def __init__(self, values) -> None: - self._dtype = StringDtype(storage="pyarrow") - if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self._data = values - else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - - if not pa.types.is_string(self._data.type): - raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" - ) + _hidden_attrs = Index._hidden_attrs | frozenset() + + # initialize to zero-length tuples to make everything work + _typ = "multiindex" + _names = FrozenList() + _levels = FrozenList() + _codes = FrozenList() + _comparables = ["names"] + + sortorder: int | None + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + levels=None, + codes=None, + sortorder=None, + names=None, + dtype=None, + copy=False, + name=None, + verify_integrity: bool = True, + ): - @classmethod - def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): - from pandas.core.arrays.masked import BaseMaskedArray + # compat with Index + if name is not None: + names = name + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must be the same.") + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + + result = object.__new__(cls) + result._cache = {} + + # we've already validated levels and codes, so shortcut here + result._set_levels(levels, copy=copy, validate=False) + result._set_codes(codes, copy=copy, validate=False) + + # Incompatible types in assignment (expression has type "List[None]", + # variable has type "FrozenList") [assignment] + result._names = [None] * len(levels) # type: ignore[assignment] + if names is not 
None: + # handles name validation + result._set_names(names) + + if sortorder is not None: + result.sortorder = int(sortorder) + else: + result.sortorder = sortorder - _chk_pyarrow_available() + if verify_integrity: + new_codes = result._verify_integrity() + result._codes = new_codes - if dtype and not (isinstance(dtype, str) and dtype == "string"): - dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + result._reset_identity() - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype in ensure_string_array and - # numerical issues with Float32Dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.string())) + return result - # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) - return cls(pa.array(result, type=pa.string(), from_pandas=True)) + def _validate_codes(self, level: list, code: list): + """ + Reassign code values as -1 if their corresponding levels are NaN. - @classmethod - def _from_sequence_of_strings( - cls, strings, dtype: Dtype | None = None, copy: bool = False - ): - return cls._from_sequence(strings, dtype=dtype, copy=copy) + Parameters + ---------- + code : list + Code to reassign. + level : list + Level to check for missing values (NaN, NaT, None). - @property - def dtype(self) -> StringDtype: + Returns + ------- + new code where code value = -1 if it corresponds + to a level with missing values (NaN, NaT, None). """ - An instance of 'string[pyarrow]'. + null_mask = isna(level) + if np.any(null_mask): + # Incompatible types in assignment (expression has type + # "ndarray[Any, dtype[Any]]", variable has type "List[Any]") + code = np.where(null_mask[code], -1, code) # type: ignore[assignment] + return code + + def _verify_integrity(self, codes: list | None = None, levels: list | None = None): """ - return self._dtype + Parameters + ---------- + codes : optional list + Codes to check for validity. Defaults to current codes. + levels : optional list + Levels to check for validity. Defaults to current levels. - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: - """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + Raises + ------ + ValueError + If length of levels and codes don't match, if the codes for any + level would exceed level bounds, or there are any duplicate levels. - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert to a NumPy ndarray. + Returns + ------- + new codes where code value = -1 if it corresponds to a + NaN level. """ - # TODO: copy argument is ignored + # NOTE: Currently does not check, among other things, that cached + # nlevels matches nor that sortorder matches actually sortorder. + codes = codes or self.codes + levels = levels or self.levels - result = np.array(self._data, dtype=dtype) - if self._data.null_count > 0: - if na_value is lib.no_default: - if dtype and np.issubdtype(dtype, np.floating): - return result - na_value = self._dtype.na_value - mask = self.isna() - result[mask] = na_value - return result + if len(levels) != len(codes): + raise ValueError( + "Length of levels and codes must match. NOTE: " + "this index is in an inconsistent state." 
+ ) + codes_length = len(codes[0]) + for i, (level, level_codes) in enumerate(zip(levels, codes)): + if len(level_codes) != codes_length: + raise ValueError( + f"Unequal code lengths: {[len(code_) for code_ in codes]}" + ) + if len(level_codes) and level_codes.max() >= len(level): + raise ValueError( + f"On level {i}, code max ({level_codes.max()}) >= length of " + f"level ({len(level)}). NOTE: this index is in an " + "inconsistent state" + ) + if len(level_codes) and level_codes.min() < -1: + raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1") + if not level.is_unique: + raise ValueError( + f"Level values must be unique: {list(level)} on level {i}" + ) + if self.sortorder is not None: + if self.sortorder > _lexsort_depth(self.codes, self.nlevels): + raise ValueError( + "Value for sortorder must be inferior or equal to actual " + f"lexsort_depth: sortorder {self.sortorder} " + f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}" + ) - @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: - encoded = self._data.dictionary_encode() - indices = pa.chunked_array( - [c.indices for c in encoded.chunks], type=encoded.type.index_type - ).to_pandas() - if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(np.int64, copy=False) - - if encoded.num_chunks: - uniques = type(self)(encoded.chunk(0).dictionary) - else: - uniques = type(self)(pa.array([], type=encoded.type.value_type)) + codes = [ + self._validate_codes(level, code) for level, code in zip(levels, codes) + ] + new_codes = FrozenList(codes) + return new_codes + + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex: + """ + Convert arrays to MultiIndex. - return indices.values, uniques + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex - @overload - def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: - ... + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - @overload - def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray: - ... + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + error_msg = "Input must be a list / sequence of array-likes." 
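
A sketch of the validation path that follows (the error text matches ``error_msg`` below):

>>> pd.MultiIndex.from_arrays(0)  # doctest: +SKIP
Traceback (most recent call last):
  ...
TypeError: Input must be a list / sequence of array-likes.
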
+ if not is_list_like(arrays): + raise TypeError(error_msg) + elif is_iterator(arrays): + arrays = list(arrays) + + # Check if elements of array are list-like + for array in arrays: + if not is_list_like(array): + raise TypeError(error_msg) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError("all arrays must be same length") + + codes, levels = factorize_from_iterables(arrays) + + if all(isinstance(e, tuple) for e in arrays): + codes = [np.array([i for i in range(len(arrays))])] + _dtype_obj = np.dtype("object") + subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) + levels = [Index(subarr)] + + if names is lib.no_default: + names = [getattr(arr, "name", None) for arr in arrays] + + return cls( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) - def __getitem__( - self: ArrowStringArray, item: PositionalIndexer - ) -> ArrowStringArray | ArrowStringScalarOrNAT: - """Select a subset of self. + @classmethod + @names_compat + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | None = None, + ) -> MultiIndex: + """ + Convert list of tuples to MultiIndex. Parameters ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. Returns ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ - item = check_array_indexer(self, item) - - if isinstance(item, np.ndarray): - if not len(item): - return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item.dtype): - return self.take(item) - elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." - ) - elif isinstance(item, tuple): - item = unpack_tuple_and_ellipses(item) - - # error: Non-overlapping identity check (left operand type: - # "Union[Union[int, integer[Any]], Union[slice, List[int], - # ndarray[Any, Any]]]", right operand type: "ellipsis") - if item is Ellipsis: # type: ignore[comparison-overlap] - # TODO: should be handled by pyarrow? - item = slice(None) - - if is_scalar(item) and not is_integer(item): - # e.g. "foo" or 2.5 - # exception message copied from numpy - raise IndexError( - r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " - r"(`None`) and integer or boolean arrays are valid indices" - ) - # We are not an array indexer, so maybe e.g. a slice or integer - # indexer. We dispatch to pyarrow. 
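
A doctest-style sketch of the dispatch this comment describes (assumes pyarrow is installed; scalar positions come back as Python values, slices as a new ArrowStringArray):

>>> arr = pd.array(['a', 'b', None], dtype="string[pyarrow]")
>>> arr[0]  # doctest: +SKIP
'a'
>>> arr[1:]  # doctest: +SKIP
<ArrowStringArray>
['b', <NA>]
Length: 2, dtype: string
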
- value = self._data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - return self._as_pandas_scalar(value) + MultiIndex - def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): - scalar = arrow_scalar.as_py() - if scalar is None: - return self._dtype.na_value - else: - return scalar - - def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, ArrowStringArray): - result = pc_func(self._data, other._data) - elif isinstance(other, (np.ndarray, list)): - result = pc_func(self._data, other) - elif is_scalar(other): - try: - result = pc_func(self._data, pa.scalar(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - mask = isna(self) | isna(other) - valid = ~mask - result = np.zeros(len(self), dtype="bool") - result[valid] = op(np.array(self)[valid], other) - return BooleanArray(result, mask) + Examples + -------- + >>> tuples = [(1, 'red'), (1, 'blue'), + ... (2, 'red'), (2, 'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + if not is_list_like(tuples): + raise TypeError("Input must be a list / sequence of tuple-likes.") + elif is_iterator(tuples): + tuples = list(tuples) + tuples = cast(Collection[Tuple[Hashable, ...]], tuples) + + arrays: list[Sequence[Hashable]] + if len(tuples) == 0: + if names is None: + raise TypeError("Cannot infer number of levels from empty list") + arrays = [[]] * len(names) + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = np.asarray(tuples._values) + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) else: - return NotImplemented + arrs = zip(*tuples) + arrays = cast(List[Sequence[Hashable]], arrs) - # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return BooleanArray._from_sequence(result.to_pandas().values) + if all(isinstance(e, tuple) for e in tuples): + if not all(tuples): + return cls.from_arrays(tuples, sortorder=sortorder, names=names) - def insert(self, loc: int, item): - if not isinstance(item, str) and item is not libmissing.NA: - raise TypeError("Scalar must be NA or str") - return super().insert(loc, item) + return cls.from_arrays(arrays, sortorder=sortorder, names=names) - def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: - """Set one or more values inplace. + @classmethod + def from_product( + cls, iterables, sortorder=None, names=lib.no_default + ) -> MultiIndex: + """ + Make a MultiIndex from the cartesian product of multiple iterables. Parameters ---------- - key : int, ndarray, or slice - When called from, e.g. ``Series.__setitem__``, ``key`` will be - one of + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. - * scalar int - * ndarray of integers. - * boolean ndarray - * slice object + .. 
versionchanged:: 1.0.0 - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. + If not explicitly provided, names will be inferred from the + elements of iterables if an element has a name attribute Returns ------- - None - """ - key = check_array_indexer(self, key) + MultiIndex - if is_integer(key): - key = cast(int, key) - - if not is_scalar(value): - raise ValueError("Must pass scalars with scalar indexer") - elif isna(value): - value = None - elif not isinstance(value, str): - raise ValueError("Scalar must be NA or str") + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - # Slice data and insert in-between - new_data = [ - *self._data[0:key].chunks, - pa.array([value], type=pa.string()), - *self._data[(key + 1) :].chunks, - ] - self._data = pa.chunked_array(new_data) - else: - # Convert to integer indices and iteratively assign. - # TODO: Make a faster variant of this in Arrow upstream. - # This is probably extremely slow. - - # Convert all possible input key types to an array of integers - if isinstance(key, slice): - key_array = np.array(range(len(self))[key]) - elif is_bool_dtype(key): - # TODO(ARROW-9430): Directly support setitem(booleans) - key_array = np.argwhere(key).flatten() - else: - # TODO(ARROW-9431): Directly support setitem(integers) - key_array = np.asanyarray(key) + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + ... names=['number', 'color']) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + """ + from pandas.core.reshape.util import cartesian_product - if is_scalar(value): - value = np.broadcast_to(value, len(key_array)) - else: - value = np.asarray(value) + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) - if len(key_array) != len(value): - raise ValueError("Length of indexer and values mismatch") + codes, levels = factorize_from_iterables(iterables) + if names is lib.no_default: + names = [getattr(it, "name", None) for it in iterables] - for k, v in zip(key_array, value): - self[k] = v + # codes are all ndarrays, so cartesian_product is lossless + codes = cartesian_product(codes) + return cls(levels, codes, sortorder=sortorder, names=names) - def take( - self, - indices: TakeIndexer, - allow_fill: bool = False, - fill_value: Any = None, - ): + @classmethod + def from_frame(cls, df: DataFrame, sortorder=None, names=None) -> MultiIndex: """ - Take elements from an array. + Make a MultiIndex from a DataFrame. Parameters ---------- - indices : sequence of int or one-dimensional np.ndarray of int - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. + df : DataFrame + DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. - * False: negative values in `indices` indicate positional indices - from the right (the default). 
This is similar to - :func:`numpy.take`. + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. - This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. + Examples + -------- + >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + + >>> pd.MultiIndex.from_frame(df) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['state', 'observation']) + """ + if not isinstance(df, ABCDataFrame): + raise TypeError("Input must be a DataFrame") + + column_names, columns = zip(*df.items()) + names = column_names if names is None else names + return cls.from_arrays(columns, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + + @cache_readonly + def _values(self) -> np.ndarray: + # We override here, since our parent uses _data, which we don't use. + values = [] + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals.dtype): + vals = cast("CategoricalIndex", vals) + vals = vals._data._internal_get_values() + if isinstance(vals.dtype, ExtensionDtype) or isinstance( + vals, (ABCDatetimeIndex, ABCTimedeltaIndex) + ): + vals = vals.astype(object) + # error: Incompatible types in assignment (expression has type "ndarray", + # variable has type "Index") + vals = np.array(vals, copy=False) # type: ignore[assignment] + values.append(vals) + + arr = lib.fast_zip(values) + return arr - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. `fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. + @property + def values(self) -> np.ndarray: + return self._values - Returns - ------- - ExtensionArray + @property + def array(self): + """ + Raises a ValueError for `MultiIndex` because there's no single + array backing a MultiIndex. Raises ------ - IndexError - When the indices are out of bounds for the array. ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. 
- """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - # error: Incompatible types in assignment (expression has type - # "Sequence[int]", variable has type "ndarray") - indices_array = indices # type: ignore[assignment] + """ + raise ValueError( + "MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples." + ) - if len(self._data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): - raise IndexError("out of bounds value in 'indices'.") + @cache_readonly + def dtypes(self) -> Series: + """ + Return the dtypes as a Series for the underlying MultiIndex. + """ + from pandas import Series - if allow_fill: - fill_mask = indices_array < 0 - if fill_mask.any(): - validate_indices(indices_array, len(self._data)) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) - if isna(fill_value): - return type(self)(result) - # TODO: ArrowNotImplementedError: Function fill_null has no - # kernel matching input types (array[string], scalar[string]) - result = type(self)(result) - result[fill_mask] = fill_value - return result - # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self._data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) - - def isin(self, values): - if pa_version_under2p0: - return super().isin(values) - - value_set = [ - pa_scalar.as_py() - for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] - if pa_scalar.type in (pa.string(), pa.null()) - ] + names = com.fill_missing_names([level.name for level in self.levels]) + return Series([level.dtype for level in self.levels], index=names) - # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True - # for null values, so we short-circuit to return all False array. 
- if not len(value_set): - return np.zeros(len(self), dtype=bool) + def __len__(self) -> int: + return len(self.codes[0]) - kwargs = {} - if pa_version_under3p0: - # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises - # with unexpected keyword argument in pyarrow 3.0.0+ - kwargs["skip_null"] = True + # -------------------------------------------------------------------- + # Levels Methods - result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) - # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls - # to False - return np.array(result, dtype=np.bool_) + @cache_readonly + def levels(self) -> FrozenList: + # Use cache_readonly to ensure that self.get_locs doesn't repeatedly + # create new IndexEngine + # https://github.com/pandas-dev/pandas/issues/31648 + result = [x._rename(name=name) for x, name in zip(self._levels, self._names)] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True + return FrozenList(result) - def value_counts(self, dropna: bool = True) -> Series: + def _set_levels( + self, + levels, + *, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: + # This is NOT part of the levels property because it should be + # externally not allowed to set levels. User beware if you change + # _levels directly + if validate: + if len(levels) == 0: + raise ValueError("Must set non-zero number of levels.") + if level is None and len(levels) != self.nlevels: + raise ValueError("Length of levels must match number of levels.") + if level is not None and len(levels) != len(level): + raise ValueError("Length of levels must match length of level.") + + if level is None: + new_levels = FrozenList( + ensure_index(lev, copy=copy)._view() for lev in levels + ) + else: + level_numbers = [self._get_level_number(lev) for lev in level] + new_levels_list = list(self._levels) + for lev_num, lev in zip(level_numbers, levels): + new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view() + new_levels = FrozenList(new_levels_list) + + if verify_integrity: + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes + + names = self.names + self._levels = new_levels + if any(names): + self._set_names(names) + + self._reset_cache() + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "levels"]) + def set_levels( + self, levels, level=None, inplace=None, verify_integrity: bool = True + ): """ - Return a Series containing counts of each unique value. + Set new levels on MultiIndex. Defaults to returning new index. Parameters ---------- - dropna : bool, default True - Don't include counts of missing values. + levels : sequence or list of sequence + New level(s) to apply. + level : int, level name, or sequence of int/level names (default None) + Level(s) to set (None for all levels). + inplace : bool + If True, mutates in place. + + .. deprecated:: 1.2.0 + verify_integrity : bool, default True + If True, checks that levels and codes are compatible. Returns ------- - counts : Series + new index (of same type and class...etc) or None + The same type as the caller or None if ``inplace=True``. - See Also + Examples -------- - Series.value_counts + >>> idx = pd.MultiIndex.from_tuples( + ... [ + ... (1, "one"), + ... (1, "two"), + ... (2, "one"), + ... (2, "two"), + ... (3, "one"), + ... (3, "two") + ... ], + ... names=["foo", "bar"] + ... 
) + >>> idx + MultiIndex([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two'), + (3, 'one'), + (3, 'two')], + names=['foo', 'bar']) + + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], + names=['foo', 'bar']) + >>> idx.set_levels(['a', 'b', 'c'], level=0) + MultiIndex([('a', 'one'), + ('a', 'two'), + ('b', 'one'), + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], + names=['foo', 'bar']) + >>> idx.set_levels(['a', 'b'], level='bar') + MultiIndex([(1, 'a'), + (1, 'b'), + (2, 'a'), + (2, 'b'), + (3, 'a'), + (3, 'b')], + names=['foo', 'bar']) + + If any of the levels passed to ``set_levels()`` exceeds the + existing length, all of the values from that argument will + be stored in the MultiIndex levels, though the values will + be truncated in the MultiIndex output. + + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ - from pandas import ( - Index, - Series, - ) - - vc = self._data.value_counts() + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + inplace = False - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) + if is_list_like(levels) and not isinstance(levels, Index): + levels = list(levels) - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) + level, levels = _require_listlike(level, levels, "Levels") - index = Index(type(self)(values)) + if inplace: + idx = self + else: + idx = self._view() + idx._reset_identity() + idx._set_levels( + levels, level=level, validate=True, verify_integrity=verify_integrity + ) + if not inplace: + return idx - return Series(counts, index=index).astype("Int64") + @property + def nlevels(self) -> int: + """ + Integer number of levels in this MultiIndex. - def astype(self, dtype, copy: bool = True): - dtype = pandas_dtype(dtype) + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.nlevels + 3 + """ + return len(self._levels) - if is_dtype_equal(dtype, self.dtype): - if copy: - return self.copy() - return self + @property + def levshape(self) -> Shape: + """ + A tuple with the length of each level. 
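+
+        Equivalent to ``tuple(len(level) for level in mi.levels)`` for a
+        MultiIndex ``mi``, matching the implementation below.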
- elif isinstance(dtype, NumericDtype): - data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) - return dtype.__from_arrow__(data) + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.levshape + (1, 1, 1) + """ + return tuple(len(x) for x in self.levels) - return super().astype(dtype, copy=copy) + # -------------------------------------------------------------------- + # Codes Methods - # ------------------------------------------------------------------------ - # String methods interface + @property + def codes(self): + return self._codes - # error: Cannot determine type of 'na_value' - _str_na_value = StringDtype.na_value # type: ignore[has-type] + def _set_codes( + self, + codes, + *, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: + if validate: + if level is None and len(codes) != self.nlevels: + raise ValueError("Length of codes must match number of levels") + if level is not None and len(codes) != len(level): + raise ValueError("Length of codes must match length of levels.") + + if level is None: + new_codes = FrozenList( + _coerce_indexer_frozen(level_codes, lev, copy=copy).view() + for lev, level_codes in zip(self._levels, codes) + ) + else: + level_numbers = [self._get_level_number(lev) for lev in level] + new_codes_list = list(self._codes) + for lev_num, level_codes in zip(level_numbers, codes): + lev = self.levels[lev_num] + new_codes_list[lev_num] = _coerce_indexer_frozen( + level_codes, lev, copy=copy + ) + new_codes = FrozenList(new_codes_list) - def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - # TODO: de-duplicate with StringArray method. This method is moreless copy and - # paste. + if verify_integrity: + new_codes = self._verify_integrity(codes=new_codes) - from pandas.arrays import ( - BooleanArray, - IntegerArray, - ) + self._codes = new_codes - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value + self._reset_cache() - mask = isna(self) - arr = np.asarray(self) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "codes"]) + def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = True): + """ + Set new codes on MultiIndex. Defaults to returning new index. - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[arg-type] - ) + Parameters + ---------- + codes : sequence or list of sequence + New codes to apply. + level : int, level name, or sequence of int/level names (default None) + Level(s) to set (None for all levels). + inplace : bool + If True, mutates in place. - if not na_value_is_na: - mask[:] = False + .. deprecated:: 1.2.0 + verify_integrity : bool, default True + If True, checks that levels and codes are compatible. - return constructor(result, mask) + Returns + ------- + new index (of same type and class...etc) or None + The same type as the caller or None if ``inplace=True``. 
- elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value + Examples + -------- + >>> idx = pd.MultiIndex.from_tuples( + ... [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"] + ... ) + >>> idx + MultiIndex([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two')], + names=['foo', 'bar']) + + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([1, 0, 1, 0], level=0) + MultiIndex([(2, 'one'), + (1, 'two'), + (2, 'one'), + (1, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([0, 0, 1, 1], level='bar') + MultiIndex([(1, 'one'), + (1, 'one'), + (2, 'two'), + (2, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], + names=['foo', 'bar']) + """ + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), ) - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) - return type(self)(result) else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) - - def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): - if flags: - return super()._str_contains(pat, case, flags, na, regex) - - if regex: - if pa_version_under4p0 or case is False: - return super()._str_contains(pat, case, flags, na, regex) - else: - result = pc.match_substring_regex(self._data, pat) + inplace = False + + level, codes = _require_listlike(level, codes, "Codes") + + if inplace: + idx = self else: - if case: - result = pc.match_substring(self._data, pat) - else: - result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) + idx = self._view() + idx._reset_identity() + idx._set_codes(codes, level=level, verify_integrity=verify_integrity) + if not inplace: + return idx + + # -------------------------------------------------------------------- + # Index Internals + + @cache_readonly + def _engine(self): + # Calculate the number of bits needed to represent labels in each + # level, as log2 of their sizes (including -1 for NaN): + sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) + + # Sum bit counts, starting from the _right_.... + lev_bits = np.cumsum(sizes[::-1])[::-1] + + # ... in order to obtain offsets such that sorting the combination of + # shifted codes (one for each level, resulting in a unique integer) is + # equivalent to sorting lexicographically the codes themselves. 
Notice + # that each level needs to be shifted by the number of bits needed to + # represent the _previous_ ones: + offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + + # Check the total number of bits needed for our representation: + if lev_bits[0] > 64: + # The levels would overflow a 64 bit uint - use Python integers: + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) + + # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return + # type "Type[MultiIndex]" in supertype "Index" + @property + def _constructor(self) -> Callable[..., MultiIndex]: # type: ignore[override] + return type(self).from_tuples + + @doc(Index._shallow_copy) + def _shallow_copy(self, values: np.ndarray, name=lib.no_default) -> MultiIndex: + names = name if name is not lib.no_default else self.names + + return type(self).from_tuples(values, sortorder=None, names=names) + + def _view(self) -> MultiIndex: + result = type(self)( + levels=self.levels, + codes=self.codes, + sortorder=self.sortorder, + names=self.names, + verify_integrity=False, + ) + result._cache = self._cache.copy() + result._cache.pop("levels", None) # GH32669 return result - def _str_startswith(self, pat: str, na=None): - if pa_version_under4p0: - return super()._str_startswith(pat, na) + # -------------------------------------------------------------------- + + def copy( + self, + names=None, + dtype=None, + levels=None, + codes=None, + deep=False, + name=None, + ): + """ + Make a copy of this object. Names, dtype, levels and codes can be + passed and will be set on new copy. + + Parameters + ---------- + names : sequence, optional + dtype : numpy dtype or pandas type, optional - pat = "^" + re.escape(pat) - return self._str_contains(pat, na=na, regex=True) + .. deprecated:: 1.2.0 + levels : sequence, optional - def _str_endswith(self, pat: str, na=None): - if pa_version_under4p0: - return super()._str_endswith(pat, na) + .. deprecated:: 1.2.0 + codes : sequence, optional - pat = re.escape(pat) + "$" - return self._str_contains(pat, na=na, regex=True) + .. deprecated:: 1.2.0 + deep : bool, default False + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ): - if ( - pa_version_under4p0 - or isinstance(pat, re.Pattern) - or callable(repl) - or not case - or flags - ): - return super()._str_replace(pat, repl, n, case, flags, regex) + Returns + ------- + MultiIndex + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + This could be potentially expensive on large MultiIndex objects. + """ + names = self._validate_names(name=name, names=names, deep=deep) + if levels is not None: + warnings.warn( + "parameter levels is deprecated and will be removed in a future " + "version. Use the set_levels method instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if codes is not None: + warnings.warn( + "parameter codes is deprecated and will be removed in a future " + "version. 
Use the set_codes method instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) - func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) - return type(self)(result) + if deep: + from copy import deepcopy - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if pa_version_under4p0: - return super()._str_match(pat, case, flags, na) + if levels is None: + levels = deepcopy(self.levels) + if codes is None: + codes = deepcopy(self.codes) - if not pat.startswith("^"): - pat = "^" + pat - return self._str_contains(pat, case, flags, na, regex=True) + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if pa_version_under4p0: - return super()._str_fullmatch(pat, case, flags, na) + new_index = type(self)( + levels=levels, + codes=codes, + sortorder=self.sortorder, + names=names, + verify_integrity=False, + ) + new_index._cache = self._cache.copy() + new_index._cache.pop("levels", None) # GH32669 + + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + new_index = new_index.astype(dtype) + return new_index - if not pat.endswith("$") or pat.endswith("//$"): - pat = pat + "$" - return self._str_match(pat, case, flags, na) + def __array__(self, dtype=None) -> np.ndarray: + """the array interface, return my values""" + return self.values - def _str_isalnum(self): - result = pc.utf8_is_alnum(self._data) - return BooleanDtype().__from_arrow__(result) + def view(self, cls=None): + """this is defined as a copy with the same identity""" + result = self.copy() + result._id = self._id + return result - def _str_isalpha(self): - result = pc.utf8_is_alpha(self._data) - return BooleanDtype().__from_arrow__(result) + @doc(Index.__contains__) + def __contains__(self, key: Any) -> bool: + hash(key) + try: + self.get_loc(key) + return True + except (LookupError, TypeError, ValueError): + return False - def _str_isdecimal(self): - result = pc.utf8_is_decimal(self._data) - return BooleanDtype().__from_arrow__(result) + @cache_readonly + def dtype(self) -> np.dtype: + return np.dtype("O") - def _str_isdigit(self): - result = pc.utf8_is_digit(self._data) - return BooleanDtype().__from_arrow__(result) + def _is_memory_usage_qualified(self) -> bool: + """return a boolean if we need a qualified .info display""" - def _str_islower(self): - result = pc.utf8_is_lower(self._data) - return BooleanDtype().__from_arrow__(result) + def f(level): + return "mixed" in level or "string" in level or "unicode" in level - def _str_isnumeric(self): - result = pc.utf8_is_numeric(self._data) - return BooleanDtype().__from_arrow__(result) + return any(f(level) for level in self._inferred_type_levels) - def _str_isspace(self): - if pa_version_under2p0: - return super()._str_isspace() + @doc(Index.memory_usage) + def memory_usage(self, deep: bool = False) -> int: + # we are overwriting our base class to avoid + # computing .values here which could materialize + # a tuple representation unnecessarily + return self._nbytes(deep) - result = pc.utf8_is_space(self._data) - return BooleanDtype().__from_arrow__(result) + @cache_readonly + def nbytes(self) -> int: + """return the number of bytes in the 
underlying data"""
+        return self._nbytes(False)

-    def _str_istitle(self):
-        result = pc.utf8_is_title(self._data)
-        return BooleanDtype().__from_arrow__(result)
+    def _nbytes(self, deep: bool = False) -> int:
+        """
+        return the number of bytes in the underlying data;
+        deeply introspect the level data if deep=True

-    def _str_isupper(self):
-        result = pc.utf8_is_upper(self._data)
-        return BooleanDtype().__from_arrow__(result)
+        include the engine hashtable

-    def _str_len(self):
-        if pa_version_under4p0:
-            return super()._str_len()
+        *this is an internal routine*

-        result = pc.utf8_length(self._data)
-        return Int64Dtype().__from_arrow__(result)
+        """
+        # for implementations with no useful getsizeof (PyPy)
+        objsize = 24

-    def _str_lower(self):
-        return type(self)(pc.utf8_lower(self._data))
+        level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels)
+        label_nbytes = sum(i.nbytes for i in self.codes)
+        names_nbytes = sum(getsizeof(i, objsize) for i in self.names)
+        result = level_nbytes + label_nbytes + names_nbytes

-    def _str_upper(self):
-        return type(self)(pc.utf8_upper(self._data))
+        # include our engine hashtable
+        result += self._engine.sizeof(deep=deep)
+        return result

-    def _str_strip(self, to_strip=None):
-        if pa_version_under4p0:
-            return super()._str_strip(to_strip)
+    # --------------------------------------------------------------------
+    # Rendering Methods

-        if to_strip is None:
-            result = pc.utf8_trim_whitespace(self._data)
+    def _formatter_func(self, tup):
+        """
+        Formats each item in tup according to its level's formatter function.
+        """
+        formatter_funcs = [level._formatter_func for level in self.levels]
+        return tuple(func(val) for func, val in zip(formatter_funcs, tup))
+
+    def _format_native_types(self, *, na_rep="nan", **kwargs):
+        new_levels = []
+        new_codes = []
+
+        # go through the levels and format them
+        for level, level_codes in zip(self.levels, self.codes):
+            level_strs = level._format_native_types(na_rep=na_rep, **kwargs)
+            # add nan values, if there are any
+            mask = level_codes == -1
+            if mask.any():
+                nan_index = len(level_strs)
+                # numpy 1.21 deprecated implicit string casting
+                level_strs = level_strs.astype(str)
+                level_strs = np.append(level_strs, na_rep)
+                assert not level_codes.flags.writeable  # i.e. 
copy is needed + level_codes = level_codes.copy() # make writeable + level_codes[mask] = nan_index + new_levels.append(level_strs) + new_codes.append(level_codes) + + if len(new_levels) == 1: + # a single-level multi-index + return Index(new_levels[0].take(new_codes[0]))._format_native_types() else: - result = pc.utf8_trim(self._data, characters=to_strip) - return type(self)(result) + # reconstruct the multi-index + mi = MultiIndex( + levels=new_levels, + codes=new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + return mi._values + + def format( + self, + name: bool | None = None, + formatter: Callable | None = None, + na_rep: str | None = None, + names: bool = False, + space: int = 2, + sparsify=None, + adjoin: bool = True, + ) -> list: + if name is not None: + names = name + + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(level_codes).format(formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_nd(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, lev_name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append( + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = "" + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify in [False, lib.no_default]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = sparsify_labels( + result_levels, start=int(names), sentinel=sentinel + ) - def _str_lstrip(self, to_strip=None): - if pa_version_under4p0: - return super()._str_lstrip(to_strip) + if adjoin: + from pandas.io.formats.format import get_adjustment - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._data) + adj = get_adjustment() + return adj.adjoin(space, *result_levels).split("\n") else: - result = pc.utf8_ltrim(self._data, characters=to_strip) - return type(self)(result) + return result_levels + + # -------------------------------------------------------------------- + # Names Methods + + def _get_names(self) -> FrozenList: + return FrozenList(self._names) + + def _set_names(self, names, *, level=None, validate: bool = True): + """ + Set new names on index. Each name has to be a hashable type. + + Parameters + ---------- + values : str or sequence + name(s) to set + level : int, level name, or sequence of int/level names (default None) + If the index is a MultiIndex (hierarchical), level(s) to set (None + for all levels). Otherwise level must be None + validate : bool, default True + validate that the names match level lengths + + Raises + ------ + TypeError if each name is not hashable. + + Notes + ----- + sets names on levels. WARNING: mutates! 
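+        (the names are written into ``self._names`` in place; the method
+        returns ``None``)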
- def _str_rstrip(self, to_strip=None): - if pa_version_under4p0: - return super()._str_rstrip(to_strip) + Note that you generally want to set this *after* changing levels, so + that it only acts on copies + """ + # GH 15110 + # Don't allow a single string for names in a MultiIndex + if names is not None and not is_list_like(names): + raise ValueError("Names should be list-like for a MultiIndex") + names = list(names) + + if validate: + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." + ) + + if level is None: + level = range(self.nlevels) + else: + level = [self._get_level_number(lev) for lev in level] + + # set the name + for lev, name in zip(level, names): + if name is not None: + # GH 20527 + # All items in 'names' need to be hashable: + if not is_hashable(name): + raise TypeError( + f"{type(self).__name__}.name must be a hashable type" + ) + # error: Cannot determine type of '__setitem__' + self._names[lev] = name # type: ignore[has-type] + + # If .levels has been accessed, the names in our cache will be stale. + self._reset_cache() + + names = property( + fset=_set_names, + fget=_get_names, + doc=""" + Names of levels in MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + >>> mi.names + FrozenList(['x', 'y', 'z']) + """, + ) + + # -------------------------------------------------------------------- + + @doc(Index._get_grouper_for_level) + def _get_grouper_for_level( + self, mapper, *, level=None + ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: + indexer = self.codes[level] + level_index = self.levels[level] + + if mapper is not None: + # Handle group mapping function and return + level_values = self.levels[level].take(indexer) + grouper = level_values.map(mapper) + return grouper, None, None + + codes, uniques = algos.factorize(indexer, sort=True) + + if len(uniques) > 0 and uniques[0] == -1: + # Handle NAs + mask = indexer != -1 + ok_codes, uniques = algos.factorize(indexer[mask], sort=True) + + codes = np.empty(len(indexer), dtype=indexer.dtype) + codes[mask] = ok_codes + codes[~mask] = -1 + + if len(uniques) < len(level_index): + # Remove unobserved levels from level_index + level_index = level_index.take(uniques) + else: + # break references back to us so that setting the name + # on the output of a groupby doesn't reflect back here. 
+ level_index = level_index.copy() - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._data) + if level_index._can_hold_na: + grouper = level_index.take(codes, fill_value=True) else: - result = pc.utf8_rtrim(self._data, characters=to_strip) - return type(self)(result) + grouper = level_index.take(codes) + + return grouper, codes, level_index + + @cache_readonly + def inferred_type(self) -> str: + return "mixed" + + def _get_level_number(self, level) -> int: + count = self.names.count(level) + if (count > 1) and not is_integer(level): + raise ValueError( + f"The name {level} occurs multiple times, use a level number" + ) + try: + level = self.names.index(level) + except ValueError as err: + if not is_integer(level): + raise KeyError(f"Level {level} not found") from err + elif level < 0: + level += self.nlevels + if level < 0: + orig_level = level - self.nlevels + raise IndexError( + f"Too many levels: Index has only {self.nlevels} levels, " + f"{orig_level} is not a valid level number" + ) from err + # Note: levels are zero-based + elif level >= self.nlevels: + raise IndexError( + f"Too many levels: Index has only {self.nlevels} levels, " + f"not {level + 1}" + ) from err + return level + + @cache_readonly + def is_monotonic_increasing(self) -> bool: + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + if any(-1 in code for code in self.codes): + return False + + if all(level.is_monotonic_increasing for level in self.levels): + # If each level is sorted, we can operate on the codes directly. GH27495 + return libalgos.is_lexsorted( + [x.astype("int64", copy=False) for x in self.codes] + ) + + # reversed() because lexsort() wants the most significant key last. + values = [ + self._get_level_values(i)._values for i in reversed(range(len(self.levels))) + ] + try: + # Argument 1 to "lexsort" has incompatible type "List[Union[ExtensionArray, + # ndarray[Any, Any]]]"; expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], bool, + # int, float, complex, str, bytes, _NestedSequence[Union[bool, int, float, + # complex, str, bytes]]]" [arg-type] + sort_order = np.lexsort(values) # type: ignore[arg-type] + return Index(sort_order).is_monotonic_increasing + except TypeError: + + # we have mixed types and np.lexsort is not happy + return Index(self._values).is_monotonic_increasing + + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + """ + return if the index is monotonic decreasing (only equal or + decreasing) values. 
+        """
+        # monotonic decreasing if and only if reverse is monotonic increasing
+        return self[::-1].is_monotonic_increasing
+
+    @cache_readonly
+    def _inferred_type_levels(self) -> list[str]:
+        """return a list of the inferred types, one for each level"""
+        return [i.inferred_type for i in self.levels]
+
+    @doc(Index.duplicated)
+    def duplicated(self, keep="first") -> npt.NDArray[np.bool_]:
+        shape = tuple(len(lev) for lev in self.levels)
+        ids = get_group_index(self.codes, shape, sort=False, xnull=False)
+
+        return duplicated(ids, keep)
+
+    # error: Cannot override final attribute "_duplicated"
+    # (previously declared in base class "IndexOpsMixin")
+    _duplicated = duplicated  # type: ignore[misc]
+
+    def fillna(self, value=None, downcast=None):
+        """
+        fillna is not implemented for MultiIndex
+        """
+        raise NotImplementedError("fillna is not implemented for MultiIndex")
+
+    @doc(Index.dropna)
+    def dropna(self, how: str = "any") -> MultiIndex:
+        nans = [level_codes == -1 for level_codes in self.codes]
+        if how == "any":
+            indexer = np.any(nans, axis=0)
+        elif how == "all":
+            indexer = np.all(nans, axis=0)
+        else:
+            raise ValueError(f"invalid how option: {how}")
+
+        new_codes = [level_codes[~indexer] for level_codes in self.codes]
+        return self.set_codes(codes=new_codes)
+
+    def _get_level_values(self, level: int, unique: bool = False) -> Index:
+        """
+        Return vector of label values for requested level,
+        equal to the length of the index
+
+        **this is an internal method**
+
+        Parameters
+        ----------
+        level : int
+        unique : bool, default False
+            if True, drop duplicated values
+
+        Returns
+        -------
+        Index
+        """
+        lev = self.levels[level]
+        level_codes = self.codes[level]
+        name = self._names[level]
+        if unique:
+            level_codes = algos.unique(level_codes)
+        filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value)
+        return lev._shallow_copy(filled, name=name)
+
+    def get_level_values(self, level):
+        """
+        Return vector of label values for requested level.
+
+        Length of returned vector is equal to the length of the index.
+
+        Parameters
+        ----------
+        level : int or str
+            ``level`` is either the integer position of the level in the
+            MultiIndex, or the name of the level.
+
+        Returns
+        -------
+        values : Index
+            Values is a level of this MultiIndex converted to
+            a single :class:`Index` (or subclass thereof).
+
+        Notes
+        -----
+        If the level contains missing values, the result may be cast to
+        ``float`` with missing values specified as ``NaN``. This is because
+        the level is converted to a regular ``Index``.
+
+        Examples
+        --------
+        Create a MultiIndex:
+
+        >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))
+        >>> mi.names = ['level_1', 'level_2']
+
+        Get level values by supplying level as either integer or name:
+
+        >>> mi.get_level_values(0)
+        Index(['a', 'b', 'c'], dtype='object', name='level_1')
+        >>> mi.get_level_values('level_2')
+        Index(['d', 'e', 'f'], dtype='object', name='level_2')
+
+        If a level contains missing values, the return type of the level
+        may be cast to ``float``.
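+
+        Below, ``dtypes`` still reports the original integer level dtype, while
+        the materialized level values come back as ``Float64Index``: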
+ + >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).dtypes + level_0 int64 + level_1 int64 + dtype: object + >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).get_level_values(0) + Float64Index([1.0, nan, 2.0], dtype='float64') + """ + level = self._get_level_number(level) + values = self._get_level_values(level) + return values + + @doc(Index.unique) + def unique(self, level=None): + + if level is None: + return super().unique() + else: + level = self._get_level_number(level) + return self._get_level_values(level=level, unique=True) + + def to_frame( + self, + index: bool = True, + name=lib.no_default, + allow_duplicates: bool = False, + ) -> DataFrame: + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + + name : list / sequence of str, optional + The passed names should substitute index level names. + + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame : a DataFrame containing the original MultiIndex data. + + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous + tabular data. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d + """ + from pandas import DataFrame + + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves the Index's name " + "or uses a default name of 0. This behaviour is deprecated, and in " + "the future `None` will be used as the name of the resulting " + "DataFrame column.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = lib.no_default + + if name is not lib.no_default: + if not is_list_like(name): + raise TypeError("'name' must be a list / sequence of column names.") + + if len(name) != len(self.levels): + raise ValueError( + "'name' should have same length as number of levels on index." + ) + idx_names = name + else: + idx_names = self._get_level_names() + + if not allow_duplicates and len(set(idx_names)) != len(idx_names): + raise ValueError( + "Cannot create duplicate column labels if allow_duplicates is False" + ) + + # Guarantee resulting column order - PY36+ dict maintains insertion order + result = DataFrame( + {level: self._get_level_values(level) for level in range(len(self.levels))}, + copy=False, + ) + result.columns = idx_names + + if index: + result.index = self + return result + + def to_flat_index(self) -> Index: + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + Returns + ------- + pd.Index + Index with the MultiIndex data represented in Tuples. + + See Also + -------- + MultiIndex.from_tuples : Convert flat index back to MultiIndex. + + Notes + ----- + This method will simply return the caller if called by anything other + than a MultiIndex. + + Examples + -------- + >>> index = pd.MultiIndex.from_product( + ... [['foo', 'bar'], ['baz', 'qux']], + ... 
names=['a', 'b'])
+        >>> index.to_flat_index()
+        Index([('foo', 'baz'), ('foo', 'qux'),
+               ('bar', 'baz'), ('bar', 'qux')],
+              dtype='object')
+        """
+        return Index(self._values, tupleize_cols=False)
+
+    def is_lexsorted(self) -> bool:
+        warnings.warn(
+            "MultiIndex.is_lexsorted is deprecated as a public function, "
+            "users should use MultiIndex.is_monotonic_increasing instead.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._is_lexsorted()
+
+    def _is_lexsorted(self) -> bool:
+        """
+        Return True if the codes are lexicographically sorted.
+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        In the examples below, the first level of the MultiIndex is sorted because
+        a<b, and the second level because c<d.
+
+        >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['d', 'e', 'f']]).is_lexsorted()
+        True
+        >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['d', 'f', 'e']]).is_lexsorted()
+        True
+
+        In case there is a tie, the lexicographical sorting looks
+        at the next level of the MultiIndex.
+
+        >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']]).is_lexsorted()
+        True
+        >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']]).is_lexsorted()
+        False
+        >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
+        ...                            ['aa', 'bb', 'aa', 'bb']]).is_lexsorted()
+        True
+        >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
+        ...                            ['bb', 'aa', 'aa', 'bb']]).is_lexsorted()
+        False
+        """
+        return self._lexsort_depth == self.nlevels
+
+    @property
+    def lexsort_depth(self) -> int:
+        warnings.warn(
+            "MultiIndex.lexsort_depth is deprecated as a public function, "
+            "users should use MultiIndex.is_monotonic_increasing instead.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._lexsort_depth
+
+    @cache_readonly
+    def _lexsort_depth(self) -> int:
+        """
+        Compute and return the lexsort_depth, the number of levels of the
+        MultiIndex that are sorted lexically.
+
+        Returns
+        -------
+        int
+        """
+        if self.sortorder is not None:
+            return self.sortorder
+        return _lexsort_depth(self.codes, self.nlevels)
+
+    def _sort_levels_monotonic(self) -> MultiIndex:
+        """
+        This is an *internal* function.
+
+        Create a new MultiIndex from the current to monotonically sorted
+        items IN the levels. This does not actually make the entire MultiIndex
+        monotonic, JUST the levels.
+
+        The resulting MultiIndex will have the same outward
+        appearance, meaning the same .values and ordering. It will also
+        be .equals() to the original.
+
+        Returns
+        -------
+        MultiIndex
+
+        Examples
+        --------
+        >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+        ... 
codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + + >>> mi.sort_values() + MultiIndex([('a', 'aa'), + ('a', 'bb'), + ('b', 'aa'), + ('b', 'bb')], + ) + """ + if self._is_lexsorted() and self.is_monotonic_increasing: + return self + + new_levels = [] + new_codes = [] + + for lev, level_codes in zip(self.levels, self.codes): + + if not lev.is_monotonic_increasing: + try: + # indexer to reorder the levels + indexer = lev.argsort() + except TypeError: + pass + else: + lev = lev.take(indexer) + + # indexer to reorder the level codes + indexer = ensure_platform_int(indexer) + ri = lib.get_reverse_indexer(indexer, len(indexer)) + level_codes = algos.take_nd(ri, level_codes) + + new_levels.append(lev) + new_codes.append(level_codes) + + return MultiIndex( + new_levels, + new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + + def remove_unused_levels(self) -> MultiIndex: + """ + Create new MultiIndex from current that removes unused levels. + + Unused level(s) means levels that are not expressed in the + labels. The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will + also be .equals() to the original. + + Returns + ------- + MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) + >>> mi + MultiIndex([(0, 'a'), + (0, 'b'), + (1, 'a'), + (1, 'b')], + ) + + >>> mi[2:] + MultiIndex([(1, 'a'), + (1, 'b')], + ) + + The 0 from the first level is not represented + and can be removed + + >>> mi2 = mi[2:].remove_unused_levels() + >>> mi2.levels + FrozenList([[1], ['a', 'b']]) + """ + new_levels = [] + new_codes = [] + + changed = False + for lev, level_codes in zip(self.levels, self.codes): + + # Since few levels are typically unused, bincount() is more + # efficient than unique() - however it only accepts positive values + # (and drops order): + uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1 + has_na = int(len(uniques) and (uniques[0] == -1)) + + if len(uniques) != len(lev) + has_na: + + if lev.isna().any() and len(uniques) == len(lev): + break + # We have unused levels + changed = True + + # Recalculate uniques, now preserving order. + # Can easily be cythonized by exploiting the already existing + # "uniques" and stop parsing "level_codes" when all items + # are found: + uniques = algos.unique(level_codes) + if has_na: + na_idx = np.where(uniques == -1)[0] + # Just ensure that -1 is in first position: + uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] + + # codes get mapped from uniques to 0:len(uniques) + # -1 (if present) is mapped to last position + code_mapping = np.zeros(len(lev) + has_na) + # ... 
and reassigned value -1: + code_mapping[uniques] = np.arange(len(uniques)) - has_na + + level_codes = code_mapping[level_codes] + + # new levels are simple + lev = lev.take(uniques[has_na:]) + + new_levels.append(lev) + new_codes.append(level_codes) + + result = self.view() + + if changed: + result._reset_identity() + result._set_levels(new_levels, validate=False) + result._set_codes(new_codes, validate=False) + + return result + + # -------------------------------------------------------------------- + # Pickling Methods + + def __reduce__(self): + """Necessary for making this object picklable""" + d = { + "levels": list(self.levels), + "codes": list(self.codes), + "sortorder": self.sortorder, + "names": list(self.names), + } + return ibase._new_Index, (type(self), d), None + + # -------------------------------------------------------------------- + + def __getitem__(self, key): + if is_scalar(key): + key = com.cast_scalar_indexer(key, warn_float=True) + + retval = [] + for lev, level_codes in zip(self.levels, self.codes): + if level_codes[key] == -1: + retval.append(np.nan) + else: + retval.append(lev[level_codes[key]]) + + return tuple(retval) + else: + # in general cannot be sure whether the result will be sorted + sortorder = None + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) + sortorder = self.sortorder + elif isinstance(key, slice): + if key.step is None or key.step > 0: + sortorder = self.sortorder + elif isinstance(key, Index): + key = np.asarray(key) + + new_codes = [level_codes[key] for level_codes in self.codes] + + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) + + def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: + """ + Fastpath for __getitem__ when we know we have a slice. 
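+        Only the codes are sliced; the levels and names are reused unchanged.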
+ """ + sortorder = None + if slobj.step is None or slobj.step > 0: + sortorder = self.sortorder + + new_codes = [level_codes[slobj] for level_codes in self.codes] + + return type(self)( + levels=self.levels, + codes=new_codes, + names=self._names, + sortorder=sortorder, + verify_integrity=False, + ) + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take( + self: MultiIndex, + indices, + axis: int = 0, + allow_fill: bool = True, + fill_value=None, + **kwargs, + ) -> MultiIndex: + nv.validate_take((), kwargs) + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + + na_value = -1 + + taken = [lab.take(indices) for lab in self.codes] + if allow_fill: + mask = indices == -1 + if mask.any(): + masked = [] + for new_label in taken: + label_values = new_label + label_values[mask] = na_value + masked.append(np.asarray(label_values)) + taken = masked + + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + if not isinstance(other, (list, tuple)): + other = [other] + + if all( + (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other + ): + arrays = [] + for i in range(self.nlevels): + label = self._get_level_values(i) + appended = [o._get_level_values(i) for o in other] + arrays.append(label.append(appended)) + return MultiIndex.from_arrays(arrays, names=self.names) + + to_concat = (self._values,) + tuple(k._values for k in other) + new_tuples = np.concatenate(to_concat) + + # if all(isinstance(x, MultiIndex) for x in other): + try: + return MultiIndex.from_tuples(new_tuples, names=self.names) + except (TypeError, IndexError): + return Index._with_infer(new_tuples) + + def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: + return self._values.argsort(*args, **kwargs) + + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) + def repeat(self, repeats: int, axis=None) -> MultiIndex: + nv.validate_repeat((), {"axis": axis}) + # error: Incompatible types in assignment (expression has type "ndarray", + # variable has type "int") + repeats = ensure_platform_int(repeats) # type: ignore[assignment] + return MultiIndex( + levels=self.levels, + codes=[ + level_codes.view(np.ndarray).astype(np.intp, copy=False).repeat(repeats) + for level_codes in self.codes + ], + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + + def drop(self, codes, level=None, errors="raise"): + """ + Make new MultiIndex with passed list of codes deleted + + Parameters + ---------- + codes : array-like + Must be a list of tuples when level is not specified + level : int or level name, default None + errors : str, default 'raise' + + Returns + ------- + dropped : MultiIndex + """ + if level is not None: + return self._drop_from_level(codes, level, errors) + + if not isinstance(codes, (np.ndarray, Index)): + try: + codes = com.index_labels_to_array(codes, dtype=np.dtype("object")) + except ValueError: + pass + + inds = [] + for level_codes in codes: + try: + loc = self.get_loc(level_codes) + # get_loc returns either an integer, a slice, or a boolean + # mask + if isinstance(loc, int): + inds.append(loc) + elif isinstance(loc, slice): + step = loc.step if loc.step is not None else 1 + 
inds.extend(range(loc.start, loc.stop, step)) + elif com.is_bool_indexer(loc): + if self._lexsort_depth == 0: + warnings.warn( + "dropping on a non-lexsorted multi-index " + "without a level parameter may impact performance.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + loc = loc.nonzero()[0] + inds.extend(loc) + else: + msg = f"unsupported indexer of type {type(loc)}" + raise AssertionError(msg) + except KeyError: + if errors != "ignore": + raise + + return self.delete(inds) + + def _drop_from_level(self, codes, level, errors="raise") -> MultiIndex: + codes = com.index_labels_to_array(codes) + i = self._get_level_number(level) + index = self.levels[i] + values = index.get_indexer(codes) + # If nan should be dropped it will equal -1 here. We have to check which values + # are not nan and equal -1, this means they are missing in the index + nan_codes = isna(codes) + values[(np.equal(nan_codes, False)) & (values == -1)] = -2 + if index.shape[0] == self.shape[0]: + values[np.equal(nan_codes, True)] = -2 + + not_found = codes[values == -2] + if len(not_found) != 0 and errors != "ignore": + raise KeyError(f"labels {not_found} not found in level") + mask = ~algos.isin(self.codes[i], values) + + return self[mask] + + def swaplevel(self, i=-2, j=-1) -> MultiIndex: + """ + Swap level i with level j. + + Calling this method does not change the ordering of the values. + + Parameters + ---------- + i : int, str, default -2 + First level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + j : int, str, default -1 + Second level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + + Returns + ------- + MultiIndex + A new MultiIndex. + + See Also + -------- + Series.swaplevel : Swap levels i and j in a MultiIndex. + DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a + particular axis. + + Examples + -------- + >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + >>> mi.swaplevel(0, 1) + MultiIndex([('bb', 'a'), + ('aa', 'a'), + ('bb', 'b'), + ('aa', 'b')], + ) + """ + new_levels = list(self.levels) + new_codes = list(self.codes) + new_names = list(self.names) + + i = self._get_level_number(i) + j = self._get_level_number(j) + + new_levels[i], new_levels[j] = new_levels[j], new_levels[i] + new_codes[i], new_codes[j] = new_codes[j], new_codes[i] + new_names[i], new_names[j] = new_names[j], new_names[i] + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + def reorder_levels(self, order) -> MultiIndex: + """ + Rearrange levels using input order. May not drop or duplicate levels. + + Parameters + ---------- + order : list of int or list of str + List representing new level order. Reference level by number + (position) or by key (label). 
+
+        Returns
+        -------
+        MultiIndex
+
+        Examples
+        --------
+        >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y'])
+        >>> mi
+        MultiIndex([(1, 3),
+                    (2, 4)],
+                   names=['x', 'y'])
+
+        >>> mi.reorder_levels(order=[1, 0])
+        MultiIndex([(3, 1),
+                    (4, 2)],
+                   names=['y', 'x'])
+
+        >>> mi.reorder_levels(order=['y', 'x'])
+        MultiIndex([(3, 1),
+                    (4, 2)],
+                   names=['y', 'x'])
+        """
+        order = [self._get_level_number(i) for i in order]
+        if len(order) != self.nlevels:
+            raise AssertionError(
+                f"Length of order must be same as number of levels ({self.nlevels}), "
+                f"got {len(order)}"
+            )
+        new_levels = [self.levels[i] for i in order]
+        new_codes = [self.codes[i] for i in order]
+        new_names = [self.names[i] for i in order]
+
+        return MultiIndex(
+            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+        )
+
+    def _get_codes_for_sorting(self) -> list[Categorical]:
+        """
+        We are categorizing our codes by using the
+        available categories (all, not just observed),
+        excluding any missing ones (-1); this is in preparation
+        for sorting, where we need to disambiguate that -1 is not
+        a valid value.
+        """
+
+        def cats(level_codes):
+            return np.arange(
+                np.array(level_codes).max() + 1 if len(level_codes) else 0,
+                dtype=level_codes.dtype,
+            )
+
+        return [
+            Categorical.from_codes(level_codes, cats(level_codes), ordered=True)
+            for level_codes in self.codes
+        ]
+
+    def sortlevel(
+        self, level=0, ascending: bool = True, sort_remaining: bool = True
+    ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
+        """
+        Sort MultiIndex at the requested level.
+
+        The result will respect the original ordering of the associated
+        factor at that level.
+
+        Parameters
+        ----------
+        level : list-like, int or str, default 0
+            If a string is given, must be a name of the level.
+            If list-like must be names or ints of levels.
+        ascending : bool, default True
+            False to sort in descending order.
+            Can also be a list to specify a directed ordering.
+        sort_remaining : bool, default True
+            Sort by the remaining levels after ``level``.
+
+        Returns
+        -------
+        sorted_index : pd.MultiIndex
+            Resulting index.
+        indexer : np.ndarray[np.intp]
+            Indices of output values in original index.
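
A minimal sketch of the idea behind _get_codes_for_sorting above (an editor's illustration, not part of the patch): each level's codes are wrapped in an ordered Categorical so that lexsorting can compare them while -1 stays missing instead of posing as a real category.

    import numpy as np
    from pandas import Categorical

    level_codes = np.array([1, -1, 0], dtype=np.int8)
    cats = np.arange(level_codes.max() + 1, dtype=level_codes.dtype)
    cat = Categorical.from_codes(level_codes, cats, ordered=True)
    # cat is [1, NaN, 0] with ordered categories [0 < 1]; the -1 stays NaN
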
+ + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) + >>> mi + MultiIndex([(0, 2), + (0, 1)], + ) + + >>> mi.sortlevel() + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(sort_remaining=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) + + >>> mi.sortlevel(1) + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(1, ascending=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) + """ + if isinstance(level, (str, int)): + level = [level] + level = [self._get_level_number(lev) for lev in level] + sortorder = None + + # we have a directed ordering via ascending + if isinstance(ascending, list): + if not len(level) == len(ascending): + raise ValueError("level must have same length as ascending") + + indexer = lexsort_indexer( + [self.codes[lev] for lev in level], orders=ascending + ) + + # level ordering + else: + + codes = list(self.codes) + shape = list(self.levshape) + + # partition codes and shape + primary = tuple(codes[lev] for lev in level) + primshp = tuple(shape[lev] for lev in level) + + # Reverse sorted to retain the order of + # smaller indices that needs to be removed + for lev in sorted(level, reverse=True): + codes.pop(lev) + shape.pop(lev) + + if sort_remaining: + primary += primary + tuple(codes) + primshp += primshp + tuple(shape) + else: + sortorder = level[0] + + indexer = indexer_from_factorized(primary, primshp, compress=False) + + if not ascending: + indexer = indexer[::-1] + + indexer = ensure_platform_int(indexer) + new_codes = [level_codes.take(indexer) for level_codes in self.codes] + + new_index = MultiIndex( + codes=new_codes, + levels=self.levels, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) + + return new_index, indexer + + def _wrap_reindex_result(self, target, indexer, preserve_names: bool): + if not isinstance(target, MultiIndex): + if indexer is None: + target = self + elif (indexer >= 0).all(): + target = self.take(indexer) + else: + try: + target = MultiIndex.from_tuples(target) + except TypeError: + # not all tuples, see test_constructor_dict_multiindex_reindex_flat + return target + + target = self._maybe_preserve_names(target, preserve_names) + return target + + def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index: + if ( + preserve_names + and target.nlevels == self.nlevels + and target.names != self.names + ): + target = target.copy(deep=False) + target.names = self.names + return target + + # -------------------------------------------------------------------- + # Indexing Methods + + def _check_indexing_error(self, key) -> None: + if not is_hashable(key) or is_iterator(key): + # We allow tuples if they are hashable, whereas other Index + # subclasses require scalar. + # We have to explicitly exclude generators, as these are hashable. + raise InvalidIndexError(key) + + @cache_readonly + def _should_fallback_to_positional(self) -> bool: + """ + Should integer key(s) be treated as positional? + """ + # GH#33355 + return self.levels[0]._should_fallback_to_positional + + def _get_values_for_loc(self, series: Series, loc, key): + """ + Do a positional lookup on the given Series, returning either a scalar + or a Series. 
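
In practice this helper is what makes partial lookups on a MultiIndexed Series work: a full key yields a scalar, a partial key yields a Series with the matched levels dropped. A sketch using only the public API (illustrative, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([["a", "a", "b"], [0, 1, 0]])
    s = pd.Series([10, 20, 30], index=mi)
    s.loc[("a", 0)]  # full key -> the scalar 10
    s.loc["a"]       # partial key -> Series indexed by [0, 1]; level 0 dropped
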
+
+        Assumes that `series.index is self`
+        """
+        new_values = series._values[loc]
+        if is_scalar(loc):
+            return new_values
+
+        if len(new_values) == 1 and not self.nlevels > 1:
+            # If more than one level left, we cannot return a scalar
+            return new_values[0]
+
+        new_index = self[loc]
+        new_index = maybe_droplevels(new_index, key)
+        new_ser = series._constructor(new_values, index=new_index, name=series.name)
+        return new_ser.__finalize__(series)
+
+    def _get_indexer_strict(
+        self, key, axis_name: str
+    ) -> tuple[Index, npt.NDArray[np.intp]]:
+
+        keyarr = key
+        if not isinstance(keyarr, Index):
+            keyarr = com.asarray_tuplesafe(keyarr)
+
+        if len(keyarr) and not isinstance(keyarr[0], tuple):
+            indexer = self._get_indexer_level_0(keyarr)
+
+            self._raise_if_missing(key, indexer, axis_name)
+            return self[indexer], indexer
+
+        return super()._get_indexer_strict(key, axis_name)
+
+    def _raise_if_missing(self, key, indexer, axis_name: str) -> None:
+        keyarr = key
+        if not isinstance(key, Index):
+            keyarr = com.asarray_tuplesafe(key)
+
+        if len(keyarr) and not isinstance(keyarr[0], tuple):
+            # i.e. same condition for special case in MultiIndex._get_indexer_strict
+
+            mask = indexer == -1
+            if mask.any():
+                check = self.levels[0].get_indexer(keyarr)
+                cmask = check == -1
+                if cmask.any():
+                    raise KeyError(f"{keyarr[cmask]} not in index")
+                # We get here when levels still contain values which are not
+                # actually in Index anymore
+                raise KeyError(f"{keyarr} not in index")
+        else:
+            return super()._raise_if_missing(key, indexer, axis_name)
+
+    def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]:
+        """
+        Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`.
+        """
+        lev = self.levels[0]
+        codes = self._codes[0]
+        cat = Categorical.from_codes(codes=codes, categories=lev)
+        ci = Index(cat)
+        return ci.get_indexer_for(target)
+
+    def get_slice_bound(
+        self, label: Hashable | Sequence[Hashable], side: str, kind=lib.no_default
+    ) -> int:
+        """
+        For an ordered MultiIndex, compute slice bound
+        that corresponds to given label.
+
+        Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
+        of given label.
+
+        Parameters
+        ----------
+        label : object or tuple of objects
+        side : {'left', 'right'}
+        kind : {'loc', 'getitem', None}
+
+            .. deprecated:: 1.4.0
+
+        Returns
+        -------
+        int
+            Index of label.
+
+        Notes
+        -----
+        This method only works if level 0 index of the MultiIndex is lexsorted.
+
+        Examples
+        --------
+        >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')])
+
+        Get the locations from the leftmost 'b' in the first level
+        until the end of the multiindex:
+
+        >>> mi.get_slice_bound('b', side="left")
+        1
+
+        Like above, but if you get the locations from the rightmost
+        'b' in the first level and 'f' in the second level:
+
+        >>> mi.get_slice_bound(('b','f'), side="right")
+        3
+
+        See Also
+        --------
+        MultiIndex.get_loc : Get location for a label or a tuple of labels.
+        MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+                              sequence of such.
+        """
+        self._deprecated_arg(kind, "kind", "get_slice_bound")
+
+        if not isinstance(label, tuple):
+            label = (label,)
+        return self._partial_tup_index(label, side=side)
+
+    def slice_locs(
+        self, start=None, end=None, step=None, kind=lib.no_default
+    ) -> tuple[int, int]:
+        """
+        For an ordered MultiIndex, compute the slice locations for input
+        labels.
+
+        The input labels can be tuples representing partial levels, e.g.
for a
+        MultiIndex with 3 levels, you can pass a single value (corresponding to
+        the first level), or a 1-, 2-, or 3-tuple.
+
+        Parameters
+        ----------
+        start : label or tuple, default None
+            If None, defaults to the beginning
+        end : label or tuple
+            If None, defaults to the end
+        step : int or None
+            Slice step
+        kind : str, optional, default None
+
+            .. deprecated:: 1.4.0
+
+        Returns
+        -------
+        (start, end) : (int, int)
+
+        Notes
+        -----
+        This method only works if the MultiIndex is properly lexsorted. So,
+        if only the first 2 levels of a 3-level MultiIndex are lexsorted,
+        you can only pass two levels to ``.slice_locs``.
+
+        Examples
+        --------
+        >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')],
+        ...                                names=['A', 'B'])
+
+        Get the slice locations from the beginning of 'b' in the first level
+        until the end of the multiindex:
+
+        >>> mi.slice_locs(start='b')
+        (1, 4)
+
+        Like above, but stop at the end of 'b' in the first level and 'f' in
+        the second level:
+
+        >>> mi.slice_locs(start='b', end=('b', 'f'))
+        (1, 3)
+
+        See Also
+        --------
+        MultiIndex.get_loc : Get location for a label or a tuple of labels.
+        MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+                              sequence of such.
+        """
+        self._deprecated_arg(kind, "kind", "slice_locs")
+        # This function adds nothing to its parent implementation (the magic
+        # happens in get_slice_bound method), but it adds meaningful doc.
+        return super().slice_locs(start, end, step)
+
+    def _partial_tup_index(self, tup: tuple, side="left"):
+        if len(tup) > self._lexsort_depth:
+            raise UnsortedIndexError(
+                f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth "
+                f"({self._lexsort_depth})"
+            )
+
+        n = len(tup)
+        start, end = 0, len(self)
+        zipped = zip(tup, self.levels, self.codes)
+        for k, (lab, lev, level_codes) in enumerate(zipped):
+            section = level_codes[start:end]
+
+            if lab not in lev and not isna(lab):
+                # short circuit
+                try:
+                    loc = algos.searchsorted(lev, lab, side=side)
+                except TypeError as err:
+                    # non-comparable e.g. test_slice_locs_with_type_mismatch
+                    raise TypeError(f"Level type mismatch: {lab}") from err
+                if not is_integer(loc):
+                    # non-comparable level, e.g. test_groupby_example
+                    raise TypeError(f"Level type mismatch: {lab}")
+                if side == "right" and loc >= 0:
+                    loc -= 1
+                return start + algos.searchsorted(section, loc, side=side)
+
+            idx = self._get_loc_single_level_index(lev, lab)
+            if isinstance(idx, slice) and k < n - 1:
+                # Get start and end value from slice, necessary when a non-integer
+                # interval is given as input GH#37707
+                start = idx.start
+                end = idx.stop
+            elif k < n - 1:
+                # error: Incompatible types in assignment (expression has type
+                # "Union[ndarray[Any, dtype[signedinteger[Any]]]
+                end = start + algos.searchsorted(  # type: ignore[assignment]
+                    section, idx, side="right"
+                )
+                # error: Incompatible types in assignment (expression has type
+                # "Union[ndarray[Any, dtype[signedinteger[Any]]]
+                start = start + algos.searchsorted(  # type: ignore[assignment]
+                    section, idx, side="left"
+                )
+            elif isinstance(idx, slice):
+                idx = idx.start
+                return start + algos.searchsorted(section, idx, side=side)
+            else:
+                return start + algos.searchsorted(section, idx, side=side)
+
+    def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
+        """
+        If key is an NA value, the location of the index is unified to -1.
+
+        Parameters
+        ----------
+        level_index: Index
+        key : label
+
+        Returns
+        -------
+        loc : int
+            If key is an NA value, loc is -1.
+            Else, location of key in index.
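
Note the guard at the top of _partial_tup_index: slicing with a tuple longer than the index's lexsort depth fails fast rather than scanning. Roughly (an illustrative sketch, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[2, 1], [3, 4]])  # unsorted: lexsort depth 0
    # mi.slice_locs(start=(1, 4))  # would raise UnsortedIndexError
    mi_sorted, _ = mi.sortlevel()
    mi_sorted.slice_locs(start=(1, 4))  # -> (0, 2)
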
+ + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + """ + if is_scalar(key) and isna(key): + return -1 + else: + return level_index.get_loc(key) + + def get_loc(self, key, method=None): + """ + Get location for a label or a tuple of labels. + + The location is returned as an integer/slice or boolean + mask. + + Parameters + ---------- + key : label or tuple of labels (one for each level) + method : None + + Returns + ------- + loc : int, slice object or boolean mask + If the key is past the lexsort depth, the return may be a + boolean mask array, otherwise it is always a slice or int. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + + Notes + ----- + The key cannot be a slice, list of same-level labels, a boolean mask, + or a sequence of such. If you want to use those, use + :meth:`MultiIndex.get_locs` instead. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + + >>> mi.get_loc('b') + slice(1, 3, None) + + >>> mi.get_loc(('b', 'e')) + 1 + """ + if method is not None: + raise NotImplementedError( + "only the default get_loc method is " + "currently supported for MultiIndex" + ) + + self._check_indexing_error(key) + + def _maybe_to_slice(loc): + """convert integer indexer to boolean mask or slice if possible""" + if not isinstance(loc, np.ndarray) or loc.dtype != np.intp: + return loc + + loc = lib.maybe_indices_to_slice(loc, len(self)) + if isinstance(loc, slice): + return loc + + mask = np.empty(len(self), dtype="bool") + mask.fill(False) + mask[loc] = True + return mask + + if not isinstance(key, tuple): + loc = self._get_level_indexer(key, level=0) + return _maybe_to_slice(loc) + + keylen = len(key) + if self.nlevels < keylen: + raise KeyError( + f"Key length ({keylen}) exceeds index depth ({self.nlevels})" + ) + + if keylen == self.nlevels and self.is_unique: + try: + return self._engine.get_loc(key) + except TypeError: + # e.g. test_partial_slicing_with_multiindex partial string slicing + loc, _ = self.get_loc_level(key, list(range(self.nlevels))) + return loc + + # -- partial selection or non-unique index + # break the key into 2 parts based on the lexsort_depth of the index; + # the first part returns a continuous slice of the index; the 2nd part + # needs linear search within the slice + i = self._lexsort_depth + lead_key, follow_key = key[:i], key[i:] + + if not lead_key: + start = 0 + stop = len(self) + else: + try: + start, stop = self.slice_locs(lead_key, lead_key) + except TypeError as err: + # e.g. 
test_groupby_example key = ((0, 0, 1, 2), "new_col") + # when self has 5 integer levels + raise KeyError(key) from err + + if start == stop: + raise KeyError(key) + + if not follow_key: + return slice(start, stop) + + warnings.warn( + "indexing past lexsort depth may impact performance.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + loc = np.arange(start, stop, dtype=np.intp) + + for i, k in enumerate(follow_key, len(lead_key)): + mask = self.codes[i][loc] == self._get_loc_single_level_index( + self.levels[i], k + ) + if not mask.all(): + loc = loc[mask] + if not len(loc): + raise KeyError(key) + + return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop) + + def get_loc_level(self, key, level=0, drop_level: bool = True): + """ + Get location and sliced index for requested label(s)/level(s). + + Parameters + ---------- + key : label or sequence of labels + level : int/level name or list thereof, optional + drop_level : bool, default True + If ``False``, the resulting index will not drop any level. + + Returns + ------- + loc : A 2-tuple where the elements are: + Element 0: int, slice object or boolean array + Element 1: The resulting sliced multiindex/index. If the key + contains all levels, this will be ``None``. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], + ... names=['A', 'B']) + + >>> mi.get_loc_level('b') + (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) + + >>> mi.get_loc_level('e', level='B') + (array([False, True, False]), Index(['b'], dtype='object', name='A')) + + >>> mi.get_loc_level(['b', 'e']) + (1, None) + """ + if not isinstance(level, (list, tuple)): + level = self._get_level_number(level) + else: + level = [self._get_level_number(lev) for lev in level] + + loc, mi = self._get_loc_level(key, level=level) + if not drop_level: + if lib.is_integer(loc): + mi = self[loc : loc + 1] + else: + mi = self[loc] + return loc, mi + + def _get_loc_level(self, key, level: int | list[int] = 0): + """ + get_loc_level but with `level` known to be positional, not name-based. + """ + + # different name to distinguish from maybe_droplevels + def maybe_mi_droplevels(indexer, levels): + """ + If level does not exist or all levels were dropped, the exception + has to be handled outside. + """ + new_index = self[indexer] + + for i in sorted(levels, reverse=True): + new_index = new_index._drop_level_numbers([i]) + + return new_index + + if isinstance(level, (tuple, list)): + if len(key) != len(level): + raise AssertionError( + "Key for location must have same length as number of levels" + ) + result = None + for lev, k in zip(level, key): + loc, new_index = self._get_loc_level(k, level=lev) + if isinstance(loc, slice): + mask = np.zeros(len(self), dtype=bool) + mask[loc] = True + loc = mask + result = loc if result is None else result & loc + + try: + # FIXME: we should be only dropping levels on which we are + # scalar-indexing + mi = maybe_mi_droplevels(result, level) + except ValueError: + # droplevel failed because we tried to drop all levels, + # i.e. 
len(level) == self.nlevels + mi = self[result] + + return result, mi + + # kludge for #1796 + if isinstance(key, list): + key = tuple(key) + + if isinstance(key, tuple) and level == 0: + + try: + # Check if this tuple is a single key in our first level + if key in self.levels[0]: + indexer = self._get_level_indexer(key, level=level) + new_index = maybe_mi_droplevels(indexer, [0]) + return indexer, new_index + except (TypeError, InvalidIndexError): + pass + + if not any(isinstance(k, slice) for k in key): + + if len(key) == self.nlevels and self.is_unique: + # Complete key in unique index -> standard get_loc + try: + return (self._engine.get_loc(key), None) + except KeyError as err: + raise KeyError(key) from err + except TypeError: + # e.g. partial string indexing + # test_partial_string_timestamp_multiindex + pass + + # partial selection + indexer = self.get_loc(key) + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] + if len(ilevels) == self.nlevels: + if is_integer(indexer): + # we are dropping all levels + return indexer, None + + # TODO: in some cases we still need to drop some levels, + # e.g. test_multiindex_perf_warn + # test_partial_string_timestamp_multiindex + ilevels = [ + i + for i in range(len(key)) + if ( + not isinstance(key[i], str) + or not self.levels[i]._supports_partial_string_indexing + ) + and key[i] != slice(None, None) + ] + if len(ilevels) == self.nlevels: + # TODO: why? + ilevels = [] + return indexer, maybe_mi_droplevels(indexer, ilevels) + + else: + indexer = None + for i, k in enumerate(key): + if not isinstance(k, slice): + loc_level = self._get_level_indexer(k, level=i) + if isinstance(loc_level, slice): + if com.is_null_slice(loc_level) or com.is_full_slice( + loc_level, len(self) + ): + # everything + continue + else: + # e.g. test_xs_IndexSlice_argument_not_implemented + k_index = np.zeros(len(self), dtype=bool) + k_index[loc_level] = True + + else: + k_index = loc_level + + elif com.is_null_slice(k): + # taking everything, does not affect `indexer` below + continue + + else: + # FIXME: this message can be inaccurate, e.g. + # test_series_varied_multiindex_alignment + raise TypeError(f"Expected label or tuple of labels, got {key}") + + if indexer is None: + indexer = k_index + else: + indexer &= k_index + if indexer is None: + indexer = slice(None, None) + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] + return indexer, maybe_mi_droplevels(indexer, ilevels) + else: + indexer = self._get_level_indexer(key, level=level) + if ( + isinstance(key, str) + and self.levels[level]._supports_partial_string_indexing + ): + # check to see if we did an exact lookup vs sliced + check = self.levels[level].get_loc(key) + if not is_integer(check): + # e.g. 
test_partial_string_timestamp_multiindex + return indexer, self[indexer] + + try: + result_index = maybe_mi_droplevels(indexer, [level]) + except ValueError: + result_index = self[indexer] + + return indexer, result_index + + def _get_level_indexer( + self, key, level: int = 0, indexer: Int64Index | None = None + ): + # `level` kwarg is _always_ positional, never name + # return an indexer, boolean array or a slice showing where the key is + # in the totality of values + # if the indexer is provided, then use this + + level_index = self.levels[level] + level_codes = self.codes[level] + + def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): + # given the inputs and the codes/indexer, compute an indexer set + # if we have a provided indexer, then this need not consider + # the entire labels set + r = np.arange(start, stop, step) + + if indexer is not None and len(indexer) != len(codes): + + # we have an indexer which maps the locations in the labels + # that we have already selected (and is not an indexer for the + # entire set) otherwise this is wasteful so we only need to + # examine locations that are in this set the only magic here is + # that the result are the mappings to the set that we have + # selected + from pandas import Series + + mapper = Series(indexer) + indexer = codes.take(ensure_platform_int(indexer)) + result = Series(Index(indexer).isin(r).nonzero()[0]) + m = result.map(mapper) + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Series") + m = np.asarray(m) # type: ignore[assignment] + + else: + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Series") + m = np.zeros(len(codes), dtype=bool) # type: ignore[assignment] + m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True + + return m + + if isinstance(key, slice): + # handle a slice, returning a slice if we can + # otherwise a boolean indexer + step = key.step + is_negative_step = step is not None and step < 0 + + try: + if key.start is not None: + start = level_index.get_loc(key.start) + elif is_negative_step: + start = len(level_index) - 1 + else: + start = 0 + + if key.stop is not None: + stop = level_index.get_loc(key.stop) + elif is_negative_step: + stop = 0 + elif isinstance(start, slice): + stop = len(level_index) + else: + stop = len(level_index) - 1 + except KeyError: + + # we have a partial slice (like looking up a partial date + # string) + start = stop = level_index.slice_indexer(key.start, key.stop, key.step) + step = start.step + + if isinstance(start, slice) or isinstance(stop, slice): + # we have a slice for start and/or stop + # a partial date slicer on a DatetimeIndex generates a slice + # note that the stop ALREADY includes the stopped point (if + # it was a string sliced) + start = getattr(start, "start", start) + stop = getattr(stop, "stop", stop) + return convert_indexer(start, stop, step) + + elif level > 0 or self._lexsort_depth == 0 or step is not None: + # need to have like semantics here to right + # searching as when we are using a slice + # so adjust the stop by 1 (so we include stop) + stop = (stop - 1) if is_negative_step else (stop + 1) + return convert_indexer(start, stop, step) + else: + # sorted, so can return slice object -> view + i = algos.searchsorted(level_codes, start, side="left") + j = algos.searchsorted(level_codes, stop, side="right") + return slice(i, j, step) + + else: + + idx = self._get_loc_single_level_index(level_index, key) + + if level > 0 
or self._lexsort_depth == 0: + # Desired level is not sorted + if isinstance(idx, slice): + # test_get_loc_partial_timestamp_multiindex + locs = (level_codes >= idx.start) & (level_codes < idx.stop) + return locs + + locs = np.array(level_codes == idx, dtype=bool, copy=False) + + if not locs.any(): + # The label is present in self.levels[level] but unused: + raise KeyError(key) + return locs + + if isinstance(idx, slice): + # e.g. test_partial_string_timestamp_multiindex + start = algos.searchsorted(level_codes, idx.start, side="left") + # NB: "left" here bc of slice semantics + end = algos.searchsorted(level_codes, idx.stop, side="left") + else: + start = algos.searchsorted(level_codes, idx, side="left") + end = algos.searchsorted(level_codes, idx, side="right") + + if start == end: + # The label is present in self.levels[level] but unused: + raise KeyError(key) + return slice(start, end) + + def get_locs(self, seq): + """ + Get location for a sequence of labels. + + Parameters + ---------- + seq : label, slice, list, mask or a sequence of such + You should use one of the above for each level. + If a level should not be used, set it to ``slice(None)``. + + Returns + ------- + numpy.ndarray + NumPy array of integers suitable for passing to iloc. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + + >>> mi.get_locs('b') # doctest: +SKIP + array([1, 2], dtype=int64) + + >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP + array([1, 2], dtype=int64) + + >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP + array([2], dtype=int64) + """ + + # must be lexsorted to at least as many levels + true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] + if true_slices and true_slices[-1] >= self._lexsort_depth: + raise UnsortedIndexError( + "MultiIndex slicing requires the index to be lexsorted: slicing " + f"on levels {true_slices}, lexsort depth {self._lexsort_depth}" + ) + + n = len(self) + # indexer is the list of all positions that we want to take; it + # is created on the first entry in seq and narrowed down as we + # look at remaining entries + indexer = None + + if any(x is Ellipsis for x in seq): + raise NotImplementedError( + "MultiIndex does not support indexing with Ellipsis" + ) + + def _convert_to_indexer(r) -> Int64Index: + # return an indexer + if isinstance(r, slice): + m = np.zeros(n, dtype=bool) + m[r] = True + r = m.nonzero()[0] + elif com.is_bool_indexer(r): + if len(r) != n: + raise ValueError( + "cannot index with a boolean indexer " + "that is not the same length as the " + "index" + ) + r = r.nonzero()[0] + return Int64Index(r) + + def _update_indexer(idxr: Index, indexer: Index | None) -> Index: + if indexer is None: + return idxr + indexer_intersection = indexer.intersection(idxr) + if indexer_intersection.empty and not idxr.empty and not indexer.empty: + raise KeyError(seq) + return indexer_intersection + + for i, k in enumerate(seq): + + if com.is_bool_indexer(k): + # a boolean indexer, must be the same length! 
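
Each entry of seq narrows the running indexer by intersection, so per-level selections that share no positions surface as a KeyError rather than an empty result. For instance (an illustrative sketch, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([list("abb"), list("def")])
    mi.get_locs([slice(None), ["e", "f"]])  # -> array([1, 2])
    # mi.get_locs([["a"], ["f"]])  # KeyError: level selections do not intersect
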
+                k = np.asarray(k)
+                lvl_indexer = _convert_to_indexer(k)
+                indexer = _update_indexer(lvl_indexer, indexer=indexer)
+
+            elif is_list_like(k):
+                # a collection of labels to include from this level (these
+                # are or'd)
+
+                indexers: Int64Index | None = None
+
+                # GH#27591 check if this is a single tuple key in the level
+                try:
+                    # Argument "indexer" to "_get_level_indexer" of "MultiIndex"
+                    # has incompatible type "Index"; expected "Optional[Int64Index]"
+                    lev_loc = self._get_level_indexer(
+                        k, level=i, indexer=indexer  # type: ignore[arg-type]
+                    )
+                except (InvalidIndexError, TypeError, KeyError) as err:
+                    # InvalidIndexError e.g. non-hashable, fall back to treating
+                    # this as a sequence of labels
+                    # KeyError: it can be ambiguous if this is a label or sequence
+                    # of labels
+                    # github.com/pandas-dev/pandas/issues/39424#issuecomment-871626708
+                    for x in k:
+                        if not is_hashable(x):
+                            # e.g. slice
+                            raise err
+                        try:
+                            # Argument "indexer" to "_get_level_indexer" of "MultiIndex"
+                            # has incompatible type "Index"; expected
+                            # "Optional[Int64Index]"
+                            item_lvl_indexer = self._get_level_indexer(
+                                x, level=i, indexer=indexer  # type: ignore[arg-type]
+                            )
+                        except KeyError:
+                            # ignore keys that are not found; see discussion in GH#39424
+                            warnings.warn(
+                                "The behavior of indexing on a MultiIndex with a "
+                                "nested sequence of labels is deprecated and will "
+                                "change in a future version. "
+                                "`series.loc[label, sequence]` will raise if any "
+                                "members of 'sequence' are not present in "
+                                "the index's second level. To retain the old "
+                                "behavior, use `series.index.isin(sequence, level=1)`",
+                                # TODO: how to opt in to the future behavior?
+                                # TODO: how to handle IntervalIndex level?
+                                #  (no test cases)
+                                FutureWarning,
+                                stacklevel=find_stack_level(),
+                            )
+                            continue
+                        else:
+                            idxrs = _convert_to_indexer(item_lvl_indexer)
+
+                            if indexers is None:
+                                indexers = idxrs
+                            else:
+                                indexers = indexers.union(idxrs, sort=False)
+
+                else:
+                    idxrs = _convert_to_indexer(lev_loc)
+                    if indexers is None:
+                        indexers = idxrs
+                    else:
+                        indexers = indexers.union(idxrs, sort=False)
+
+                if indexers is not None:
+                    indexer = _update_indexer(indexers, indexer=indexer)
+                else:
+                    # no matches, we are done
+                    # test_loc_getitem_duplicates_multiindex_empty_indexer
+                    return np.array([], dtype=np.intp)
+
+            elif com.is_null_slice(k):
+                # empty slice
+                if indexer is None:
+                    indexer = Index(np.arange(n))
+
+            elif isinstance(k, slice):
+
+                # a slice, include BOTH of the labels
+                # Argument "indexer" to "_get_level_indexer" of "MultiIndex" has
+                # incompatible type "Index"; expected "Optional[Int64Index]"
+                lvl_indexer = self._get_level_indexer(
+                    k,
+                    level=i,
+                    indexer=indexer,  # type: ignore[arg-type]
+                )
+                indexer = _update_indexer(
+                    _convert_to_indexer(lvl_indexer),
+                    indexer=indexer,
+                )
+            else:
+                # a single label
+                lvl_indexer = self._get_loc_level(k, level=i)[0]
+                indexer = _update_indexer(
+                    _convert_to_indexer(lvl_indexer),
+                    indexer=indexer,
+                )
+
+        # empty indexer
+        if indexer is None:
+            return np.array([], dtype=np.intp)
+
+        assert isinstance(indexer, Int64Index), type(indexer)
+        indexer = self._reorder_indexer(seq, indexer)
+
+        return indexer._values.astype(np.intp, copy=False)
+
+    # --------------------------------------------------------------------
+
+    def _reorder_indexer(
+        self,
+        seq: tuple[Scalar | Iterable | AnyArrayLike, ...],
+        indexer: Int64Index,
+    ) -> Int64Index:
+        """
+        Reorder an indexer of a MultiIndex (self) so that the labels are in the
+        same order as given in seq
+
+        Parameters
+        ----------
+        seq :
label/slice/list/mask or a sequence of such
+        indexer: an Int64Index indexer of self
+
+        Returns
+        -------
+        indexer : a sorted Int64Index indexer of self ordered as seq
+        """
+        # If the index is lexsorted and the list-like labels in seq are sorted
+        # then we do not need to sort
+        if self._is_lexsorted():
+            need_sort = False
+            for i, k in enumerate(seq):
+                if is_list_like(k):
+                    if not need_sort:
+                        k_codes = self.levels[i].get_indexer(k)
+                        k_codes = k_codes[k_codes >= 0]  # Filter absent keys
+                        # True if the given codes are not ordered
+                        need_sort = (k_codes[:-1] > k_codes[1:]).any()
+                elif isinstance(k, slice) and k.step is not None and k.step < 0:
+                    need_sort = True
+            # Bail out if both index and seq are sorted
+            if not need_sort:
+                return indexer
+
+        n = len(self)
+        keys: tuple[np.ndarray, ...] = ()
+        # For each level of the sequence in seq, map the level codes with the
+        # order in which they appear in a list-like sequence
+        # This mapping is then used to reorder the indexer
+        for i, k in enumerate(seq):
+            if is_scalar(k):
+                # GH#34603 we want to treat a scalar the same as an all equal list
+                k = [k]
+            if com.is_bool_indexer(k):
+                new_order = np.arange(n)[indexer]
+            elif is_list_like(k):
+                # Generate a map with all level codes as sorted initially
+                k = algos.unique(k)
+                key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
+                    self.levels[i]
+                )
+                # Set order as given in the indexer list
+                level_indexer = self.levels[i].get_indexer(k)
+                level_indexer = level_indexer[level_indexer >= 0]  # Filter absent keys
+                key_order_map[level_indexer] = np.arange(len(level_indexer))
+
+                new_order = key_order_map[self.codes[i][indexer]]
+            elif isinstance(k, slice) and k.step is not None and k.step < 0:
+                # flip order for negative step
+                new_order = np.arange(n)[::-1][indexer]
+            elif isinstance(k, slice) and k.start is None and k.stop is None:
+                # slice(None) should not determine order GH#31330
+                new_order = np.ones((n,))[indexer]
+            else:
+                # For all other cases, use the same order as the level
+                new_order = np.arange(n)[indexer]
+            keys = (new_order,) + keys
+
+        # Find the reordering using lexsort on the keys mapping
+        ind = np.lexsort(keys)
+        return indexer[ind]
+
+    def truncate(self, before=None, after=None) -> MultiIndex:
+        """
+        Slice index between two labels / tuples, return new MultiIndex
+
+        Parameters
+        ----------
+        before : label or tuple, can be partial. Default None
+            None defaults to start
+        after : label or tuple, can be partial.
Default None + None defaults to end + + Returns + ------- + truncated : MultiIndex + """ + if after and before and after < before: + raise ValueError("after < before") + + i, j = self.levels[0].slice_locs(before, after) + left, right = self.slice_locs(before, after) + + new_levels = list(self.levels) + new_levels[0] = new_levels[0][i:j] + + new_codes = [level_codes[left:right] for level_codes in self.codes] + new_codes[0] = new_codes[0] - i + + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=self._names, + verify_integrity=False, + ) + + def equals(self, other: object) -> bool: + """ + Determines if two MultiIndex objects have the same labeling information + (the levels themselves do not necessarily have to be the same) + + See Also + -------- + equal_levels + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + if len(self) != len(other): + return False + + if not isinstance(other, MultiIndex): + # d-level MultiIndex can equal d-tuple Index + if not self._should_compare(other): + # object Index or Categorical[object] may contain tuples + return False + return array_equivalent(self._values, other._values) + + if self.nlevels != other.nlevels: + return False + + for i in range(self.nlevels): + self_codes = self.codes[i] + other_codes = other.codes[i] + self_mask = self_codes == -1 + other_mask = other_codes == -1 + if not np.array_equal(self_mask, other_mask): + return False + self_codes = self_codes[~self_mask] + self_values = self.levels[i]._values.take(self_codes) + + other_codes = other_codes[~other_mask] + other_values = other.levels[i]._values.take(other_codes) + + # since we use NaT both datetime64 and timedelta64 we can have a + # situation where a level is typed say timedelta64 in self (IOW it + # has other values than NaT) but types datetime64 in other (where + # its all NaT) but these are equivalent + if len(self_values) == 0 and len(other_values) == 0: + continue + + if not isinstance(self_values, np.ndarray): + # i.e. ExtensionArray + if not self_values.equals(other_values): + return False + elif not isinstance(other_values, np.ndarray): + # i.e. 
other is ExtensionArray + if not other_values.equals(self_values): + return False + else: + if not array_equivalent(self_values, other_values): + return False + + return True + + def equal_levels(self, other: MultiIndex) -> bool: + """ + Return True if the levels of both MultiIndex objects are the same + + """ + if self.nlevels != other.nlevels: + return False + + for i in range(self.nlevels): + if not self.levels[i].equals(other.levels[i]): + return False + return True + + # -------------------------------------------------------------------- + # Set Methods + + def _union(self, other, sort) -> MultiIndex: + other, result_names = self._convert_can_do_setop(other) + if ( + any(-1 in code for code in self.codes) + and any(-1 in code for code in other.codes) + or self.has_duplicates + or other.has_duplicates + ): + # This is only necessary if both sides have nans or one has dups, + # fast_unique_multiple is faster + result = super()._union(other, sort) + else: + rvals = other._values.astype(object, copy=False) + result = lib.fast_unique_multiple([self._values, rvals], sort=sort) + + return MultiIndex.from_arrays(zip(*result), sortorder=None, names=result_names) + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + return is_object_dtype(dtype) + + def _get_reconciled_name_object(self, other) -> MultiIndex: + """ + If the result of a set operation will be self, + return self, unless the names change, in which + case make a shallow copy of self. + """ + names = self._maybe_match_names(other) + if self.names != names: + # Incompatible return value type (got "Optional[MultiIndex]", expected + # "MultiIndex") + return self.rename(names) # type: ignore[return-value] + return self + + def _maybe_match_names(self, other): + """ + Try to find common names to attach to the result of an operation between + a and b. Return a consensus list of names if they match at least partly + or list of None if they have completely different names. + """ + if len(self.names) != len(other.names): + return [None] * len(self.names) + names = [] + for a_name, b_name in zip(self.names, other.names): + if a_name == b_name: + names.append(a_name) + else: + # TODO: what if they both have np.nan for their names? 
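
The consensus rule above keeps a name only where both operands agree, position by position. A quick illustration via a set operation (an editor's sketch, not part of the patch):

    import pandas as pd

    left = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"])
    right = pd.MultiIndex.from_arrays([[1, 5], [3, 6]], names=["a", "c"])
    left.union(right).names  # FrozenList(['a', None]); only the agreed name survives
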
+ names.append(None) + return names + + def _wrap_intersection_result(self, other, result) -> MultiIndex: + _, result_names = self._convert_can_do_setop(other) + + if len(result) == 0: + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) + else: + return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) + + def _wrap_difference_result(self, other, result) -> MultiIndex: + _, result_names = self._convert_can_do_setop(other) + + if len(result) == 0: + return MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) + else: + return MultiIndex.from_tuples(result, sortorder=0, names=result_names) + + def _convert_can_do_setop(self, other): + result_names = self.names + + if not isinstance(other, Index): + + if len(other) == 0: + return self[:0], self.names + else: + msg = "other must be a MultiIndex or a list of tuples" + try: + other = MultiIndex.from_tuples(other, names=self.names) + except (ValueError, TypeError) as err: + # ValueError raised by tuples_to_object_array if we + # have non-object dtype + raise TypeError(msg) from err + else: + result_names = get_unanimous_names(self, other) + + return other, result_names + + # -------------------------------------------------------------------- + + @doc(Index.astype) + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + if is_categorical_dtype(dtype): + msg = "> 1 ndim Categorical are not supported at this time" + raise NotImplementedError(msg) + elif not is_object_dtype(dtype): + raise TypeError( + "Setting a MultiIndex dtype to anything other than object " + "is not supported" + ) + elif copy is True: + return self._view() + return self + + def _validate_fill_value(self, item): + if isinstance(item, MultiIndex): + # GH#43212 + if item.nlevels != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item._values + elif not isinstance(item, tuple): + # Pad the key with empty strings if lower levels of the key + # aren't specified: + item = (item,) + ("",) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item + + def insert(self, loc: int, item) -> MultiIndex: + """ + Make new MultiIndex inserting new item at location + + Parameters + ---------- + loc : int + item : tuple + Must be same length as number of levels in the MultiIndex + + Returns + ------- + new_index : Index + """ + item = self._validate_fill_value(item) + + new_levels = [] + new_codes = [] + for k, level, level_codes in zip(item, self.levels, self.codes): + if k not in level: + # have to insert into level + # must insert at end otherwise you have to recompute all the + # other codes + lev_loc = len(level) + level = level.insert(lev_loc, k) + else: + lev_loc = level.get_loc(k) + + new_levels.append(level) + new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc)) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) + + def delete(self, loc) -> MultiIndex: + """ + Make new index with passed location deleted + + Returns + ------- + new_index : MultiIndex + """ + new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + verify_integrity=False, + ) + + @doc(Index.isin) + def isin(self, values, level=None) -> 
npt.NDArray[np.bool_]: + if level is None: + values = MultiIndex.from_tuples(values, names=self.names)._values + return algos.isin(self._values, values) + else: + num = self._get_level_number(level) + levs = self.get_level_values(num) + + if levs.size == 0: + return np.zeros(len(levs), dtype=np.bool_) + return levs.isin(values) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "names"]) + def set_names(self, names, level=None, inplace: bool = False) -> MultiIndex | None: + return super().set_names(names=names, level=level, inplace=inplace) + + rename = set_names + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def drop_duplicates(self, keep: str | bool = "first") -> MultiIndex: + return super().drop_duplicates(keep=keep) + + # --------------------------------------------------------------- + # Arithmetic/Numeric Methods - Disabled + + __add__ = make_invalid_op("__add__") + __radd__ = make_invalid_op("__radd__") + __iadd__ = make_invalid_op("__iadd__") + __sub__ = make_invalid_op("__sub__") + __rsub__ = make_invalid_op("__rsub__") + __isub__ = make_invalid_op("__isub__") + __pow__ = make_invalid_op("__pow__") + __rpow__ = make_invalid_op("__rpow__") + __mul__ = make_invalid_op("__mul__") + __rmul__ = make_invalid_op("__rmul__") + __floordiv__ = make_invalid_op("__floordiv__") + __rfloordiv__ = make_invalid_op("__rfloordiv__") + __truediv__ = make_invalid_op("__truediv__") + __rtruediv__ = make_invalid_op("__rtruediv__") + __mod__ = make_invalid_op("__mod__") + __rmod__ = make_invalid_op("__rmod__") + __divmod__ = make_invalid_op("__divmod__") + __rdivmod__ = make_invalid_op("__rdivmod__") + # Unary methods disabled + __neg__ = make_invalid_op("__neg__") + __pos__ = make_invalid_op("__pos__") + __abs__ = make_invalid_op("__abs__") + __invert__ = make_invalid_op("__invert__") + + +def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: + """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" + int64_codes = [ensure_int64(level_codes) for level_codes in codes] + for k in range(nlevels, 0, -1): + if libalgos.is_lexsorted(int64_codes[:k]): + return k + return 0 + + +def sparsify_labels(label_list, start: int = 0, sentinel=""): + pivoted = list(zip(*label_list)) + k = len(label_list) + + result = pivoted[: start + 1] + prev = pivoted[start] + + for cur in pivoted[start + 1 :]: + sparse_cur = [] + + for i, (p, t) in enumerate(zip(prev, cur)): + if i == k - 1: + sparse_cur.append(t) + result.append(sparse_cur) + break + + if p == t: + sparse_cur.append(sentinel) + else: + sparse_cur.extend(cur[i:]) + result.append(sparse_cur) + break + + prev = cur + + return list(zip(*result)) + + +def _get_na_rep(dtype) -> str: + return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") + + +def maybe_droplevels(index: Index, key) -> Index: + """ + Attempt to drop level or levels from the given index. 
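
MultiIndex.isin, defined just above, matches whole tuples by default and single-level values when level is given. For example (an illustrative sketch, not part of the patch):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]])
    mi.isin([("a", 1)])      # -> array([ True, False])
    mi.isin(["a"], level=0)  # -> array([ True, False])
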
+ + Parameters + ---------- + index: Index + key : scalar or tuple + + Returns + ------- + Index + """ + # drop levels + original_index = index + if isinstance(key, tuple): + for _ in key: + try: + index = index._drop_level_numbers([0]) + except ValueError: + # we have dropped too much, so back out + return original_index + else: + try: + index = index._drop_level_numbers([0]) + except ValueError: + pass + + return index + + +def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: + """ + Coerce the array-like indexer to the smallest integer dtype that can encode all + of the given categories. + + Parameters + ---------- + array_like : array-like + categories : array-like + copy : bool + + Returns + ------- + np.ndarray + Non-writeable. + """ + array_like = coerce_indexer_dtype(array_like, categories) + if copy: + array_like = array_like.copy() + array_like.flags.writeable = False + return array_like + + +def _require_listlike(level, arr, arrname: str): + """ + Ensure that level is either None or listlike, and arr is list-of-listlike. + """ + if level is not None and not is_list_like(level): + if not is_list_like(arr): + raise TypeError(f"{arrname} must be list-like") + if is_list_like(arr[0]): + raise TypeError(f"{arrname} must be list-like") + level = [level] + arr = [arr] + elif level is None or is_list_like(level): + if not is_list_like(arr) or not is_list_like(arr[0]): + raise TypeError(f"{arrname} must be list of lists-like") + return level, arr From 5f4e7174ed8cf4ca538092cca27bc6c73e7d787a Mon Sep 17 00:00:00 2001 From: Khor Chean Wei Date: Tue, 15 Mar 2022 22:27:23 +0800 Subject: [PATCH 18/39] Update test_constructors.py --- pandas/tests/indexes/base_class/test_constructors.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index c40948fb12b41..759ffed5ff1d8 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -41,12 +41,9 @@ def test_constructor_cast(self): with pytest.raises(ValueError, match=msg): Index(["a", "b", "c"], dtype=float) - def test_construct_empty_tuples(self): + @pytest.mark.parametrize("tuple_list", [[()], [(), ()]]) + def test_construct_empty_tuples(self, tuple_list): # GH #45608 - result = Index([()]) - expected = Index([()], dtype="object") - tm.assert_index_equal(result, expected) - - result = Index([(), None]) - expected = Index([(), None], dtype="object") + result = Index(tuple_list) + expected = MultiIndex.from_tuples(tuple_list) tm.assert_index_equal(result, expected) From 1b23a4c1d3efb9a9118d44576dd639a4e021b170 Mon Sep 17 00:00:00 2001 From: "chean.wei.khor" Date: Tue, 15 Mar 2022 22:30:45 +0800 Subject: [PATCH 19/39] test --- pandas/tests/indexes/base_class/test_constructors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 759ffed5ff1d8..df04502a01f99 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -46,4 +46,5 @@ def test_construct_empty_tuples(self, tuple_list): # GH #45608 result = Index(tuple_list) expected = MultiIndex.from_tuples(tuple_list) + tm.assert_index_equal(result, expected) From 560e4c2a1f534d3e2c4e14d0f86f22552a32ea4e Mon Sep 17 00:00:00 2001 From: Khor Chean Wei Date: Wed, 16 Mar 2022 00:06:11 +0800 
Subject: [PATCH 20/39] Update multi.py --- pandas/core/indexes/multi.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 538baaee1b961..09e3ca7a397fd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -489,10 +489,11 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex codes, levels = factorize_from_iterables(arrays) if all(isinstance(e, tuple) for e in arrays): - codes = [np.array([i for i in range(len(arrays))])] - _dtype_obj = np.dtype("object") - subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) - levels = [Index(subarr)] + if not all(arrays): + codes = [[i for i in range(len(arrays))]] + _dtype_obj = np.dtype("object") + subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) + levels = [Index(subarr)] if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] From bcbed46a05b4ce4425a4c28e2a779cd1335679ec Mon Sep 17 00:00:00 2001 From: Khor Chean Wei Date: Wed, 16 Mar 2022 00:51:24 +0800 Subject: [PATCH 21/39] Update multi.py --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 09e3ca7a397fd..69c9cc8fe009d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex if all(isinstance(e, tuple) for e in arrays): if not all(arrays): - codes = [[i for i in range(len(arrays))]] + codes = list(i for i in range(len(arrays))) _dtype_obj = np.dtype("object") subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) levels = [Index(subarr)] From 8f2d3a51177a9e3191921416a4b45fbd0cbf858c Mon Sep 17 00:00:00 2001 From: "chean.wei.khor" Date: Wed, 16 Mar 2022 01:00:54 +0800 Subject: [PATCH 22/39] pre-commit --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 69c9cc8fe009d..1f1b9f04d3dda 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex if all(isinstance(e, tuple) for e in arrays): if not all(arrays): - codes = list(i for i in range(len(arrays))) + codes = [list(arrays)] _dtype_obj = np.dtype("object") subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) levels = [Index(subarr)] From 9c60f32b666b93ec0c594d43ca9c4bf4ee3c8a98 Mon Sep 17 00:00:00 2001 From: "chean.wei.khor" Date: Wed, 16 Mar 2022 16:45:52 +0800 Subject: [PATCH 23/39] pre commit --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1f1b9f04d3dda..02694de09c885 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex if all(isinstance(e, tuple) for e in arrays): if not all(arrays): - codes = [list(arrays)] + codes = [[0 for _ in range(len(arrays))]] _dtype_obj = np.dtype("object") subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) levels = [Index(subarr)] From 5f1d3bd3f098d216d45f73d6fb75b62e2c8776f3 Mon Sep 17 00:00:00 2001 From: "chean.wei.khor" Date: Wed, 16 Mar 2022 16:50:02 +0800 Subject: [PATCH 24/39] pre commit --- pandas/core/indexes/multi.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 02694de09c885..3e331ef9ca039 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex if all(isinstance(e, tuple) for e in arrays): if not all(arrays): - codes = [[0 for _ in range(len(arrays))]] + codes = [np.array([0 for _ in range(len(arrays))])] _dtype_obj = np.dtype("object") subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) levels = [Index(subarr)] From a792791aacec5177e13c42de7545f7987256d9a4 Mon Sep 17 00:00:00 2001 From: "chean.wei.khor" Date: Thu, 17 Mar 2022 21:04:36 +0800 Subject: [PATCH 25/39] form tuples --- pandas/core/indexes/multi.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d632502b3af73..191230423e8fd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -488,13 +488,6 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex codes, levels = factorize_from_iterables(arrays) - if all(isinstance(e, tuple) for e in arrays): - if not all(arrays): - codes = [np.array([0 for _ in range(len(arrays))])] - _dtype_obj = np.dtype("object") - subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) - levels = [Index(subarr)] - if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] @@ -573,8 +566,18 @@ def from_tuples( if all(isinstance(e, tuple) for e in tuples): if not all(tuples): - return cls.from_arrays(tuples, sortorder=sortorder, names=names) + codes = [np.array([0 for _ in range(len(arrays))])] + _dtype_obj = np.dtype("object") + subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj) + levels = [Index(subarr)] + return cls( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) return cls.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod From 0f84944b0bce2fdebf80e67f4390a9fd527f1a9d Mon Sep 17 00:00:00 2001 From: "chean.wei.khor" Date: Thu, 17 Mar 2022 21:05:46 +0800 Subject: [PATCH 26/39] form tuples --- pandas/core/indexes/multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 191230423e8fd..469348af78a30 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -487,7 +487,6 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> MultiIndex raise ValueError("all arrays must be same length") codes, levels = factorize_from_iterables(arrays) - if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] From 7bd3ed1920a6f7a9b2f1ab57cb82a47844926603 Mon Sep 17 00:00:00 2001 From: Khor Chean Wei Date: Fri, 18 Mar 2022 20:56:34 +0800 Subject: [PATCH 27/39] Update multi.py --- pandas/core/indexes/multi.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c110541c0b914..d6a1db5b9326a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -562,20 +562,19 @@ def from_tuples( arrs = zip(*tuples) arrays = cast(List[Sequence[Hashable]], arrs) - if all(isinstance(e, tuple) for e in tuples): - if not all(tuples): - codes = [np.array([0 for _ in range(len(arrays))])] - _dtype_obj = np.dtype("object") - subarr = 
From 7bd3ed1920a6f7a9b2f1ab57cb82a47844926603 Mon Sep 17 00:00:00 2001
From: Khor Chean Wei
Date: Fri, 18 Mar 2022 20:56:34 +0800
Subject: [PATCH 27/39] Update multi.py

---
 pandas/core/indexes/multi.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c110541c0b914..d6a1db5b9326a 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,20 +562,19 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if all(isinstance(e, tuple) for e in tuples):
-            if not all(tuples):
-                codes = [np.array([0 for _ in range(len(arrays))])]
-                _dtype_obj = np.dtype("object")
-                subarr = com.asarray_tuplesafe(arrays, dtype=_dtype_obj)
-                levels = [Index(subarr)]
-
-                return cls(
-                    levels=levels,
-                    codes=codes,
-                    sortorder=sortorder,
-                    names=names,
-                    verify_integrity=False,
-                )
+
+        if not all(tuples):
+            codes = [np.array([0 for _ in range(len(arrays))])]
+            levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]
+
+            return cls(
+                levels=levels,
+                codes=codes,
+                sortorder=sortorder,
+                names=names,
+                verify_integrity=False,
+            )
+
         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

     @classmethod

From 315808a41267435c2e7b3a5fac26a53a83fd36ed Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Fri, 18 Mar 2022 20:57:50 +0800
Subject: [PATCH 28/39] resolve comment

---
 pandas/core/indexes/multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index d6a1db5b9326a..1b9a70a75d67b 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,8 +562,8 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)
-
         if not all(tuples):
+
             codes = [np.array([0 for _ in range(len(arrays))])]
             levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]

From 91844676ae08951a1c6b4c98deed3725d1830208 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Fri, 18 Mar 2022 23:04:05 +0800
Subject: [PATCH 29/39] resolve comment

---
 pandas/core/indexes/multi.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1b9a70a75d67b..99ff91deae19f 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,8 +562,7 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)
-        if not all(tuples):
-
+        if all(isinstance(e, tuple) for e in tuples) and not all(tuples):
             codes = [np.array([0 for _ in range(len(arrays))])]
             levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]

From af6f0679a83142fd5c6a9f57ffb9baf85b1f0353 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 19 Mar 2022 14:59:50 +0800
Subject: [PATCH 30/39] multi

---
 pandas/core/indexes/multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 99ff91deae19f..8b34dd35145f5 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,7 +562,7 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if all(isinstance(e, tuple) for e in tuples) and not all(tuples):
+        if all(isinstance(e, tuple) and not e for e in tuples):
             codes = [np.array([0 for _ in range(len(arrays))])]
             levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]
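Note (editorial aside, not part of the patch series): the predicate rewrites in patches 29-30 are not cosmetic. `all(isinstance(e, tuple) for e in tuples) and not all(tuples)` is also true for mixed input such as `[(), ("a",)]`, where the single-level construction would be wrong; moving the `not e` inside the `all(...)` makes the branch fire only when every element is an empty tuple. A quick comparison:

    def loose(ts):   # patch 29: all tuples, and at least one is empty
        return all(isinstance(e, tuple) for e in ts) and not all(ts)

    def strict(ts):  # patch 30: every element is an empty tuple
        return all(isinstance(e, tuple) and not e for e in ts)

    print(loose([(), ("a",)]), strict([(), ("a",)]))  # True False
    print(loose([(), ()]), strict([(), ()]))          # True True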
From d4c499bcd545ffd77cd9dd1b7a1087454b360e24 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 19 Mar 2022 15:01:00 +0800
Subject: [PATCH 31/39] multi

---
 pandas/core/indexes/multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 8b34dd35145f5..da85a57c39e05 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,7 +562,7 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if all(isinstance(e, tuple) and not e for e in tuples):
+        if all((isinstance(e, tuple) and not e) for e in tuples):
             codes = [np.array([0 for _ in range(len(arrays))])]
             levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]

From 71a1064e54b096ae83d99a003a6b57f49c7dba62 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 19 Mar 2022 15:02:58 +0800
Subject: [PATCH 32/39] np zero

---
 pandas/core/indexes/multi.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index da85a57c39e05..b72731e9e16a0 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -563,9 +563,8 @@ def from_tuples(
         arrays = cast(List[Sequence[Hashable]], arrs)

         if all((isinstance(e, tuple) and not e) for e in tuples):
-            codes = [np.array([0 for _ in range(len(arrays))])]
+            codes = [np.zeros(len(arrays))]
             levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]
-
             return cls(
                 levels=levels,
                 codes=codes,

From 9215fc7fd76b6ed105017bf9c1bc6759260c946d Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 19 Mar 2022 16:44:50 +0800
Subject: [PATCH 33/39] tuples

---
 pandas/core/indexes/multi.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index b72731e9e16a0..c879c231aef28 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,16 +562,19 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if all((isinstance(e, tuple) and not e) for e in tuples):
-            codes = [np.zeros(len(arrays))]
-            levels = [Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))]
-            return cls(
-                levels=levels,
-                codes=codes,
-                sortorder=sortorder,
-                names=names,
-                verify_integrity=False,
-            )
+        if tuples:
+            if all((isinstance(e, tuple) and not e) for e in tuples):
+                codes = [np.zeros(len(arrays))]
+                levels = [
+                    Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))
+                ]
+                return cls(
+                    levels=levels,
+                    codes=codes,
+                    sortorder=sortorder,
+                    names=names,
+                    verify_integrity=False,
+                )

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

From 4798b4c23d6075d29fefd5f460fc0818beaa1c03 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Sat, 19 Mar 2022 18:28:42 +0800
Subject: [PATCH 34/39] tuple len

---
 pandas/core/indexes/multi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c879c231aef28..d83a1e7420bd7 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,7 +562,7 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if tuples:
+        if len(tuples) != 0:
             if all((isinstance(e, tuple) and not e) for e in tuples):
                 codes = [np.zeros(len(arrays))]
                 levels = [
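Note (editorial aside, not part of the patch series): the guard added in patches 33-34 defends against a vacuous-truth trap: `all(...)` over an empty iterable is `True`, so without a length check an empty input such as `MultiIndex.from_tuples([])` would be routed into the empty-tuple branch instead of the dedicated `len(tuples) == 0` path further down. For example:

    print(all(isinstance(e, tuple) and not e for e in []))  # True, vacuously

    # hence: only take the branch for a non-empty, all-empty-tuple input
    tuples = []
    print(len(tuples) != 0
          and all(isinstance(e, tuple) and not e for e in tuples))  # False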
From 79f00b46b37b9de7c8621eb4c9474d9b04479546 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Thu, 24 Mar 2022 19:52:33 +0800
Subject: [PATCH 35/39] tuple

---
 pandas/core/indexes/multi.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index d83a1e7420bd7..f895462ffba13 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -562,19 +562,16 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if len(tuples) != 0:
-            if all((isinstance(e, tuple) and not e) for e in tuples):
-                codes = [np.zeros(len(arrays))]
-                levels = [
-                    Index(com.asarray_tuplesafe(arrays, dtype=np.dtype("object")))
-                ]
-                return cls(
-                    levels=levels,
-                    codes=codes,
-                    sortorder=sortorder,
-                    names=names,
-                    verify_integrity=False,
-                )
+        if all((isinstance(e, tuple) and not e) for e in tuples):
+            codes = [np.zeros(len(tuples))]
+            levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
+            return cls(
+                levels=levels,
+                codes=codes,
+                sortorder=sortorder,
+                names=names,
+                verify_integrity=False,
+            )

         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

From 3e6a00a3e0c060c8235d18fc1de48af3510e7f21 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Thu, 24 Mar 2022 21:05:40 +0800
Subject: [PATCH 36/39] testing

---
 pandas/core/indexes/multi.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index f895462ffba13..d980da5aef313 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -544,8 +544,19 @@ def from_tuples(
             raise TypeError("Input must be a list / sequence of tuple-likes.")
         elif is_iterator(tuples):
             tuples = list(tuples)

-        tuples = cast(Collection[Tuple[Hashable, ...]], tuples)
+        if len(tuples) and all((isinstance(e, tuple) and not e) for e in tuples):
+            codes = [np.zeros(len(tuples))]
+            levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
+            return cls(
+                levels=levels,
+                codes=codes,
+                sortorder=sortorder,
+                names=names,
+                verify_integrity=False,
+            )
+
+        tuples = cast(Collection[Tuple[Hashable, ...]], tuples)
         arrays: list[Sequence[Hashable]]
         if len(tuples) == 0:
             if names is None:
@@ -562,17 +573,6 @@ def from_tuples(
         arrs = zip(*tuples)
         arrays = cast(List[Sequence[Hashable]], arrs)

-        if all((isinstance(e, tuple) and not e) for e in tuples):
-            codes = [np.zeros(len(tuples))]
-            levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
-            return cls(
-                levels=levels,
-                codes=codes,
-                sortorder=sortorder,
-                names=names,
-                verify_integrity=False,
-            )
-
         return cls.from_arrays(arrays, sortorder=sortorder, names=names)

     @classmethod

From e725d03f28b98af99915d28cd9161e4a4b2a64ea Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Thu, 24 Mar 2022 23:06:01 +0800
Subject: [PATCH 37/39] types

---
 pandas/core/indexes/multi.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index d980da5aef313..b6e1b92527d6c 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -544,7 +544,7 @@ def from_tuples(
             raise TypeError("Input must be a list / sequence of tuple-likes.")
         elif is_iterator(tuples):
             tuples = list(tuples)
-
+        tuples = cast(Collection[Tuple[Hashable, ...]], tuples)
         if len(tuples) and all((isinstance(e, tuple) and not e) for e in tuples):
             codes = [np.zeros(len(tuples))]
             levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
@@ -555,8 +555,6 @@ def from_tuples(
                 names=names,
                 verify_integrity=False,
             )
-
-        tuples = cast(Collection[Tuple[Hashable, ...]], tuples)
         arrays: list[Sequence[Hashable]]
         if len(tuples) == 0:
             if names is None:
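Note (editorial aside, not part of the patch series): patches 36-37 give the branch its final position, immediately after `tuples` is materialized and cast and before the `len(tuples) == 0` handling, so every caller of `from_tuples`, including the `Index(...)` constructor, passes through it. A sketch of the behavior this is aiming at (assuming a pandas build with these patches applied; exact reprs may differ):

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([(), ()])
    print(len(mi), mi.nlevels)  # 2 1 -- two rows, one object-dtype level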
From 70eb65853efcdb3c73ed6b9d289fac58ad485453 Mon Sep 17 00:00:00 2001
From: "chean.wei.khor"
Date: Fri, 25 Mar 2022 09:37:54 +0800
Subject: [PATCH 38/39] whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 2 +-
 pandas/core/indexes/multi.py   | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 50a5bf383de77..2dc02d166b9fd 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -485,7 +485,7 @@ MultiIndex
 - Bug in :meth:`DataFrame.loc` raising when slicing a :class:`MultiIndex` with a negative step size and slicing a non-int labeled index level (:issue:`46156`)
 - Bug in :meth:`Series.to_numpy` where multiindexed Series could not be converted to numpy arrays when an ``na_value`` was supplied (:issue:`45774`)
 - Bug in :class:`MultiIndex.equals` not commutative when only one side has extension array dtype (:issue:`46026`)
--
+- bug in :meth:`MultiIndex.from_tuples` cannot construct Index of empty tuples (:issue:`45608`)

 I/O
 ^^^
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index b6e1b92527d6c..f41b3f80a25ad 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -545,7 +545,9 @@ def from_tuples(
         elif is_iterator(tuples):
             tuples = list(tuples)
         tuples = cast(Collection[Tuple[Hashable, ...]], tuples)
-        if len(tuples) and all((isinstance(e, tuple) and not e) for e in tuples):
+
+        # handling the empty tuple cases
+        if len(tuples) and all(isinstance(e, tuple) and not e for e in tuples):
             codes = [np.zeros(len(tuples))]
             levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
             return cls(
@@ -555,6 +557,7 @@ def from_tuples(
                 names=names,
                 verify_integrity=False,
             )
+
         arrays: list[Sequence[Hashable]]
         if len(tuples) == 0:
             if names is None:

From 3f15379af6158c57ae91111941eeacacb1770e7f Mon Sep 17 00:00:00 2001
From: Khor Chean Wei
Date: Sat, 26 Mar 2022 11:49:09 +0800
Subject: [PATCH 39/39] Update v1.5.0.rst

---
 doc/source/whatsnew/v1.5.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 2dc02d166b9fd..bd515b0b093e3 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -485,7 +485,7 @@ MultiIndex
 - Bug in :meth:`DataFrame.loc` raising when slicing a :class:`MultiIndex` with a negative step size and slicing a non-int labeled index level (:issue:`46156`)
 - Bug in :meth:`Series.to_numpy` where multiindexed Series could not be converted to numpy arrays when an ``na_value`` was supplied (:issue:`45774`)
 - Bug in :class:`MultiIndex.equals` not commutative when only one side has extension array dtype (:issue:`46026`)
-- bug in :meth:`MultiIndex.from_tuples` cannot construct Index of empty tuples (:issue:`45608`)
+- Bug in :meth:`MultiIndex.from_tuples` cannot construct Index of empty tuples (:issue:`45608`)

 I/O
 ^^^
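Note (editorial aside, not part of the patch series): the whatsnew entry above is the user-visible summary of the whole series. Before it, constructing an index from empty tuples reportedly failed with `ValueError: Must pass non-zero number of levels/codes` (:issue:`45608`); with the series applied, both spellings below are expected to construct successfully:

    import pandas as pd

    # pandas without the fix: both lines raised
    #     ValueError: Must pass non-zero number of levels/codes
    idx = pd.Index([()])                      # GH 45608
    mi = pd.MultiIndex.from_tuples([(), ()])
    print(len(idx), len(mi))                  # 1 2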