Skip to content

ENH: Index[bool] #45061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Feb 5, 2022
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7ef2dba
ENH/WIP: Index[bool]
jbrockmendel Dec 24, 2021
f912217
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 28, 2021
f15b927
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 28, 2021
5142a2d
failing tests
jbrockmendel Dec 28, 2021
a30a485
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 29, 2021
0db360a
mypy fixup
jbrockmendel Dec 29, 2021
84faf3e
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 29, 2021
5a1b1f0
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 30, 2021
b67bab1
xfail
jbrockmendel Dec 30, 2021
89a1cab
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 1, 2022
3b2f6e3
fix Boolean GroupBy tests
jbrockmendel Jan 2, 2022
1d28d25
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 7, 2022
8cf7dfe
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 7, 2022
298757c
document value_counts behavior, skips and xfails
jbrockmendel Jan 7, 2022
33ca864
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 12, 2022
16d5202
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 14, 2022
f2c67a9
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 15, 2022
7f2edc3
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 26, 2022
ad59ca1
post-merge cleanup
jbrockmendel Jan 26, 2022
dc347b8
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 26, 2022
44a49af
fix should_compare
jbrockmendel Jan 26, 2022
a2b4de5
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 29, 2022
9b57f13
whatsnew
jbrockmendel Jan 29, 2022
c05a968
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 29, 2022
26105c0
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 30, 2022
04ede77
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 30, 2022
941986c
patch IntervalIndex._can_hold_na
jbrockmendel Jan 30, 2022
5ba5a47
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 31, 2022
3dd23bb
remove duplicate line
jbrockmendel Jan 31, 2022
ce19666
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 31, 2022
7924be4
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 1, 2022
589a72a
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 1, 2022
5b46f11
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 1, 2022
8c91b1c
fix _check_dtype for BoolEngine
jbrockmendel Feb 1, 2022
0b73eba
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 2, 2022
c46d3a3
remove no-longer-needed
jbrockmendel Feb 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Other enhancements
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
- Implemented a ``bool``-dtype :class:`Index`. Passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
-

.. ---------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class ObjectEngine(IndexEngine): ...
class DatetimeEngine(Int64Engine): ...
class TimedeltaEngine(DatetimeEngine): ...
class PeriodEngine(Int64Engine): ...
class BoolEngine(UInt8Engine): ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,12 @@ cdef class BaseMultiIndexCodesEngine:
include "index_class_helper.pxi"


# Index engine for bool-dtype values; inherits from UInt8Engine.
cdef class BoolEngine(UInt8Engine):
# Validate a lookup key: anything that is not a bool object is rejected
# with KeyError(val); bool objects pass through without raising.
cdef _check_type(self, object val):
if not util.is_bool_object(val):
raise KeyError(val)


@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
Expand Down
5 changes: 3 additions & 2 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,8 @@ def _create_mi_with_dt64tz_level():
"num_uint8": tm.makeNumericIndex(100, dtype="uint8"),
"num_float64": tm.makeNumericIndex(100, dtype="float64"),
"num_float32": tm.makeNumericIndex(100, dtype="float32"),
"bool": tm.makeBoolIndex(10),
"bool-object": tm.makeBoolIndex(10).astype(object),
"bool-dtype": Index(np.random.randn(10) < 0),
"categorical": tm.makeCategoricalIndex(100),
"interval": tm.makeIntervalIndex(100),
"empty": Index([]),
Expand Down Expand Up @@ -630,7 +631,7 @@ def index_flat_unique(request):
key
for key in indices_dict
if not (
key in ["int", "uint", "range", "empty", "repeats"]
key in ["int", "uint", "range", "empty", "repeats", "bool-dtype"]
or key.startswith("num_")
)
and not isinstance(indices_dict[key], MultiIndex)
Expand Down
16 changes: 11 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,6 @@ def _reconstruct_data(
elif is_bool_dtype(dtype):
values = values.astype(dtype, copy=False)

# we only support object dtypes bool Index
if isinstance(original, ABCIndex):
values = values.astype(object, copy=False)
elif dtype is not None:
if is_datetime64_dtype(dtype):
dtype = np.dtype("datetime64[ns]")
Expand Down Expand Up @@ -830,7 +827,10 @@ def value_counts(
-------
Series
"""
from pandas.core.series import Series
from pandas import (
Index,
Series,
)

name = getattr(values, "name", None)

Expand Down Expand Up @@ -868,7 +868,13 @@ def value_counts(
else:
keys, counts = value_counts_arraylike(values, dropna)

result = Series(counts, index=keys, name=name)
# For backwards compatibility, we let Index do its normal type
# inference, _except_ if it infers from object to bool.
idx = Index._with_infer(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)

result = Series(counts, index=idx, name=name)

if sort:
result = result.sort_values(ascending=ascending)
Expand Down
28 changes: 24 additions & 4 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,6 @@ def _outer_indexer(
_comparables: list[str] = ["name"]
_attributes: list[str] = ["name"]
_is_numeric_dtype: bool = False
_can_hold_na: bool = True
_can_hold_strings: bool = True

# Whether this index is a NumericIndex, but not a Int64Index, Float64Index,
Expand Down Expand Up @@ -505,6 +504,10 @@ def __new__(
if data.dtype.kind in ["i", "u", "f"]:
# maybe coerce to a sub-class
arr = data
elif data.dtype.kind == "b":
# No special subclass, and Index._ensure_array won't do this
# for us.
arr = np.asarray(data)
else:
arr = com.asarray_tuplesafe(data, dtype=_dtype_obj)

Expand Down Expand Up @@ -702,7 +705,7 @@ def _with_infer(cls, *args, **kwargs):
# "Union[ExtensionArray, ndarray[Any, Any]]"; expected
# "ndarray[Any, Any]"
values = lib.maybe_convert_objects(result._values) # type: ignore[arg-type]
if values.dtype.kind in ["i", "u", "f"]:
if values.dtype.kind in ["i", "u", "f", "b"]:
return Index(values, name=result.name)

return result
Expand Down Expand Up @@ -872,9 +875,12 @@ def _engine(
):
return libindex.ExtensionEngine(target_values)

target_values = cast(np.ndarray, target_values)
# to avoid a reference cycle, bind `target_values` to a local variable, so
# `self` is not passed into the lambda.
target_values = cast(np.ndarray, target_values)
if target_values.dtype == bool:
return libindex.BoolEngine(target_values)

# error: Argument 1 to "ExtensionEngine" has incompatible type
# "ndarray[Any, Any]"; expected "ExtensionArray"
return self._engine_type(target_values) # type:ignore[arg-type]
Expand Down Expand Up @@ -2205,6 +2211,14 @@ def _get_grouper_for_level(self, mapper, *, level=None):
# --------------------------------------------------------------------
# Introspection Methods

@cache_readonly
def _can_hold_na(self) -> bool:
# Report whether this index's dtype can hold missing values.
# Extension dtypes answer for themselves via their own ``_can_hold_na``.
if isinstance(self.dtype, ExtensionDtype):
return self.dtype._can_hold_na
# Dtype kinds "i", "u" and "b" (NumPy's codes for signed integer,
# unsigned integer and boolean) are reported as unable to hold NA.
if self.dtype.kind in ["i", "u", "b"]:
return False
# Every remaining dtype kind is reported as able to hold NA.
return True

@final
@property
def is_monotonic(self) -> bool:
Expand Down Expand Up @@ -2665,6 +2679,8 @@ def _is_all_dates(self) -> bool:
"""
Whether or not the index values only consist of dates.
"""
if self.dtype.kind == "b":
return False
return is_datetime_array(ensure_object(self._values))

@cache_readonly
Expand Down Expand Up @@ -6157,6 +6173,10 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
"""
if self.dtype.kind == "b":
return dtype.kind == "b"
if is_numeric_dtype(self.dtype):
return is_numeric_dtype(dtype)
return True

@final
Expand Down Expand Up @@ -7274,7 +7294,7 @@ def _maybe_cast_data_without_dtype(
FutureWarning,
stacklevel=3,
)
if result.dtype.kind in ["b", "c"]:
if result.dtype.kind in ["c"]:
return subarr
result = ensure_wrapped_if_datetimelike(result)
return result
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,11 @@ class IntervalIndex(ExtensionIndex):
_can_hold_strings = False
_data_cls = IntervalArray

# FIXME: this is inaccurate for integer-backed IntervalArray, but
# is the pre-existing behavior before GH#45061 (Index[bool]).
# Without this, other.categories.take raises in IntervalArray._cmp_method
_can_hold_na = True

# --------------------------------------------------------------------
# Constructors

Expand Down
13 changes: 0 additions & 13 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
)
from pandas._typing import (
Dtype,
DtypeObj,
npt,
)
from pandas.util._decorators import (
Expand Down Expand Up @@ -92,14 +91,6 @@ class NumericIndex(Index):
_can_hold_strings = False
_is_backward_compat_public_numeric_index: bool = True

# error: Signature of "_can_hold_na" incompatible with supertype "Index"
@cache_readonly
def _can_hold_na(self) -> bool: # type: ignore[override]
if is_float_dtype(self.dtype):
return True
else:
return False

_engine_types: dict[np.dtype, type[libindex.IndexEngine]] = {
np.dtype(np.int8): libindex.Int8Engine,
np.dtype(np.int16): libindex.Int16Engine,
Expand Down Expand Up @@ -285,10 +276,6 @@ def _convert_tolerance(self, tolerance, target):
)
return tolerance

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
# If we ever have BoolIndex or ComplexIndex, this may need to be tightened
return is_numeric_dtype(dtype)

@classmethod
def _assert_safe_casting(cls, data: np.ndarray, subarr: np.ndarray) -> None:
"""
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,8 @@ def to_datetime(
result = convert_listlike(arg, format)
else:
result = convert_listlike(np.array([arg]), format)[0]
if isinstance(arg, bool) and isinstance(result, np.bool_):
result = bool(result) # TODO: avoid this kludge.

# error: Incompatible return value type (got "Union[Timestamp, NaTType,
# Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def _hash_ndarray(

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
elif isinstance(dtype, bool):
elif dtype == bool:
vals = vals.astype("u8")
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view("i8").astype("u8", copy=False)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def test_value_counts_with_nan(dropna, index_or_series):
obj = klass(values)
res = obj.value_counts(dropna=dropna)
if dropna is True:
expected = Series([1], index=[True])
expected = Series([1], index=Index([True], dtype=obj.dtype))
else:
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
tm.assert_series_equal(res, expected)
5 changes: 5 additions & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def test_ensure_copied_data(self, index):
# RangeIndex cannot be initialized from data
# MultiIndex and CategoricalIndex are tested separately
return
elif index.dtype == object and index.inferred_type == "boolean":
init_kwargs["dtype"] = index.dtype

index_type = type(index)
result = index_type(index.values, copy=True, **init_kwargs)
Expand Down Expand Up @@ -522,6 +524,9 @@ def test_fillna(self, index):
# GH 11343
if len(index) == 0:
return
elif index.dtype == bool:
# can't hold NAs
return
elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype):
return
elif isinstance(index, MultiIndex):
Expand Down
19 changes: 14 additions & 5 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,13 +621,22 @@ def test_get_loc_implicit_cast(self, level, dtypes):
idx = MultiIndex.from_product(levels)
assert idx.get_loc(tuple(key)) == 3

def test_get_loc_cast_bool(self):
# GH 19086 : int is cast to bool, but not vice-versa
levels = [[False, True], np.arange(2, dtype="int64")]
@pytest.mark.parametrize("dtype", [bool, object])
def test_get_loc_cast_bool(self, dtype):
# GH 19086 : int is cast to bool, but not vice-versa (for object dtype)
# With bool dtype, we don't cast in either direction.
levels = [Index([False, True], dtype=dtype), np.arange(2, dtype="int64")]
idx = MultiIndex.from_product(levels)

assert idx.get_loc((0, 1)) == 1
assert idx.get_loc((1, 0)) == 2
if dtype is bool:
with pytest.raises(KeyError, match=r"^\(0, 1\)$"):
assert idx.get_loc((0, 1)) == 1
with pytest.raises(KeyError, match=r"^\(1, 0\)$"):
assert idx.get_loc((1, 0)) == 2
else:
# We use python object comparisons, which treat 0 == False and 1 == True
assert idx.get_loc((0, 1)) == 1
assert idx.get_loc((1, 0)) == 2

with pytest.raises(KeyError, match=r"^\(False, True\)$"):
idx.get_loc((False, True))
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/indexes/test_any_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ def test_mutability(index):
def test_map_identity_mapping(index):
# GH#12766
result = index.map(lambda x: x)
if index.dtype == object and result.dtype == bool:
assert (index == result).all()
# TODO: could work that into the 'exact="equiv"'?
return # FIXME: doesn't belong in this file anymore!
tm.assert_index_equal(result, index, exact="equiv")


Expand Down
27 changes: 18 additions & 9 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,15 +321,21 @@ def test_view_with_args(self, index):
"unicode",
"string",
pytest.param("categorical", marks=pytest.mark.xfail(reason="gh-25464")),
"bool",
"bool-object",
"bool-dtype",
"empty",
],
indirect=True,
)
def test_view_with_args_object_array_raises(self, index):
msg = "Cannot change data-type for object array"
with pytest.raises(TypeError, match=msg):
index.view("i8")
if index.dtype == bool:
msg = "When changing to a larger dtype"
with pytest.raises(ValueError, match=msg):
index.view("i8")
else:
msg = "Cannot change data-type for object array"
with pytest.raises(TypeError, match=msg):
index.view("i8")

@pytest.mark.parametrize("index", ["int", "range"], indirect=True)
def test_astype(self, index):
Expand Down Expand Up @@ -397,9 +403,9 @@ def test_is_(self):

def test_asof_numeric_vs_bool_raises(self):
left = Index([1, 2, 3])
right = Index([True, False])
right = Index([True, False], dtype=object)

msg = "Cannot compare dtypes int64 and object"
msg = "Cannot compare dtypes int64 and bool"
with pytest.raises(TypeError, match=msg):
left.asof(right[0])
# TODO: should right.asof(left[0]) also raise?
Expand Down Expand Up @@ -591,7 +597,8 @@ def test_append_empty_preserve_name(self, name, expected):
"index, expected",
[
("string", False),
("bool", False),
("bool-object", False),
("bool-dtype", False),
("categorical", False),
("int", True),
("datetime", False),
Expand All @@ -606,7 +613,8 @@ def test_is_numeric(self, index, expected):
"index, expected",
[
("string", True),
("bool", True),
("bool-object", True),
("bool-dtype", False),
("categorical", False),
("int", False),
("datetime", False),
Expand All @@ -621,7 +629,8 @@ def test_is_object(self, index, expected):
"index, expected",
[
("string", False),
("bool", False),
("bool-object", False),
("bool-dtype", False),
("categorical", False),
("int", False),
("datetime", True),
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ def test_constructor_non_hashable_name(self, index_flat):

def test_constructor_unwraps_index(self, index_flat):
a = index_flat
b = type(a)(a)
# Passing dtype is necessary for Index([True, False], dtype=object)
# case.
b = type(a)(a, dtype=a.dtype)
tm.assert_equal(a._data, b._data)

def test_to_flat_index(self, index_flat):
Expand Down Expand Up @@ -426,6 +428,9 @@ def test_hasnans_isnans(self, index_flat):
return
elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype):
return
elif index.dtype == bool:
# values[1] = np.nan below casts to True!
return

values[1] = np.nan

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/test_index_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_constructor_dtypes_to_object(self, cast_index, vals):
index = Index(vals)

assert type(index) is Index
assert index.dtype == object
assert index.dtype == bool

def test_constructor_categorical_to_object(self):
# GH#32167 Categorical data and dtype=object should return object-dtype
Expand Down
Loading