Skip to content

PERF: dtype checks #52766

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

from pandas.core.dtypes.common import (
is_bool,
is_extension_array_dtype,
is_integer_dtype,
is_number,
is_numeric_dtype,
Expand Down Expand Up @@ -316,7 +315,7 @@ def _get_ilevel_values(index, level):
if not left.equals(right):
mismatch = left._values != right._values

if is_extension_array_dtype(mismatch):
if not isinstance(mismatch, np.ndarray):
mismatch = cast("ExtensionArray", mismatch).fillna(True)

diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_signed_integer_dtype,
Expand Down Expand Up @@ -471,7 +470,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:

if (
len(values) > 0
and is_numeric_dtype(values.dtype)
and values.dtype.kind in "iufcb"
and not is_signed_integer_dtype(comps)
):
# GH#46485 Use object to avoid upcast to float64 later
Expand Down Expand Up @@ -1403,7 +1402,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
)

is_timedelta = False
if needs_i8_conversion(arr.dtype):
if arr.dtype.kind in "mM":
dtype = np.int64
arr = arr.view("i8")
na = iNaT
Expand All @@ -1413,7 +1412,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
# We have to cast in order to be able to hold np.nan
dtype = np.object_

elif is_integer_dtype(dtype):
elif dtype.kind in "iu":
# We have to cast in order to be able to hold np.nan

# int8, int16 are incompatible with float64,
Expand Down
9 changes: 4 additions & 5 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
is_bool_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
is_hashable,
is_integer_dtype,
is_list_like,
Expand Down Expand Up @@ -618,7 +617,7 @@ def _from_inferred_categories(

if known_categories:
# Convert to a specialized type with `dtype` if specified.
if is_any_real_numeric_dtype(dtype.categories):
if is_any_real_numeric_dtype(dtype.categories.dtype):
cats = to_numeric(inferred_categories, errors="coerce")
elif lib.is_np_dtype(dtype.categories.dtype, "M"):
cats = to_datetime(inferred_categories, errors="coerce")
Expand Down Expand Up @@ -701,7 +700,7 @@ def from_codes(
)
raise ValueError(msg)

if is_extension_array_dtype(codes) and is_integer_dtype(codes):
if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
# Avoid the implicit conversion of Int to object
if isna(codes).any():
raise ValueError("codes cannot contain NA values")
Expand Down Expand Up @@ -1598,7 +1597,7 @@ def _internal_get_values(self):
# if we are a datetime and period index, return Index to keep metadata
if needs_i8_conversion(self.categories.dtype):
return self.categories.take(self._codes, fill_value=NaT)
elif is_integer_dtype(self.categories) and -1 in self._codes:
elif is_integer_dtype(self.categories.dtype) and -1 in self._codes:
return self.categories.astype("object").take(self._codes, fill_value=np.nan)
return np.array(self)

Expand Down Expand Up @@ -1809,7 +1808,7 @@ def _values_for_rank(self) -> np.ndarray:
if mask.any():
values = values.astype("float64")
values[mask] = np.nan
elif is_any_real_numeric_dtype(self.categories):
elif is_any_real_numeric_dtype(self.categories.dtype):
values = np.array(self)
else:
# reorder the categories (so rank can use the float codes)
Expand Down
7 changes: 1 addition & 6 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
is_datetime64_any_dtype,
is_dtype_equal,
is_float_dtype,
is_object_dtype,
is_sparse,
is_string_dtype,
pandas_dtype,
Expand Down Expand Up @@ -2038,11 +2037,7 @@ def _sequence_to_dt64ns(
if out_unit is not None:
out_dtype = np.dtype(f"M8[{out_unit}]")

if (
is_object_dtype(data_dtype)
or is_string_dtype(data_dtype)
or is_sparse(data_dtype)
):
if data_dtype == object or is_string_dtype(data_dtype) or is_sparse(data_dtype):
# TODO: We do not have tests specific to string-dtypes,
# also complex or categorical or other extension
copy = False
Expand Down
32 changes: 17 additions & 15 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,9 @@
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
is_bool,
is_bool_dtype,
is_dtype_equal,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
Expand Down Expand Up @@ -408,9 +405,11 @@ def to_numpy(
na_value = libmissing.NA
if dtype is None:
dtype = object
else:
dtype = np.dtype(dtype)
if self._hasna:
if (
not is_object_dtype(dtype)
dtype != object
and not is_string_dtype(dtype)
and na_value is libmissing.NA
):
Expand Down Expand Up @@ -545,7 +544,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
else:
inputs2.append(x)

def reconstruct(x):
def reconstruct(x: np.ndarray):
# we don't worry about scalar `x` here, since we
# raise for reduce up above.
from pandas.core.arrays import (
Expand All @@ -554,13 +553,13 @@ def reconstruct(x):
IntegerArray,
)

if is_bool_dtype(x.dtype):
if x.dtype.kind == "b":
m = mask.copy()
return BooleanArray(x, m)
elif is_integer_dtype(x.dtype):
elif x.dtype.kind in "iu":
m = mask.copy()
return IntegerArray(x, m)
elif is_float_dtype(x.dtype):
elif x.dtype.kind == "f":
m = mask.copy()
if x.dtype == np.float16:
# reached in e.g. np.sqrt on BooleanArray
Expand Down Expand Up @@ -763,7 +762,9 @@ def _cmp_method(self, other, op) -> BooleanArray:
mask = self._propagate_mask(mask, other)
return BooleanArray(result, mask, copy=False)

def _maybe_mask_result(self, result, mask):
def _maybe_mask_result(
self, result: np.ndarray | tuple[np.ndarray, np.ndarray], mask: np.ndarray
):
"""
Parameters
----------
Expand All @@ -778,12 +779,12 @@ def _maybe_mask_result(self, result, mask):
self._maybe_mask_result(mod, mask),
)

if is_float_dtype(result.dtype):
if result.dtype.kind == "f":
from pandas.core.arrays import FloatingArray

return FloatingArray(result, mask, copy=False)

elif is_bool_dtype(result.dtype):
elif result.dtype.kind == "b":
from pandas.core.arrays import BooleanArray

return BooleanArray(result, mask, copy=False)
Expand All @@ -794,13 +795,14 @@ def _maybe_mask_result(self, result, mask):
# e.g. test_numeric_arr_mul_tdscalar_numexpr_path
from pandas.core.arrays import TimedeltaArray

result[mask] = result.dtype.type("NaT")

if not isinstance(result, TimedeltaArray):
result = TimedeltaArray._simple_new(result, dtype=result.dtype)
return TimedeltaArray._simple_new(result, dtype=result.dtype)

result[mask] = result.dtype.type("NaT")
return result

elif is_integer_dtype(result.dtype):
elif result.dtype.kind in "iu":
from pandas.core.arrays import IntegerArray

return IntegerArray(result, mask, copy=False)
Expand Down Expand Up @@ -875,7 +877,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]
result = isin(self._data, values_arr)

if self._hasna:
values_have_NA = is_object_dtype(values_arr.dtype) and any(
values_have_NA = values_arr.dtype == object and any(
val is self.dtype.na_value for val in values_arr
)

Expand Down
9 changes: 4 additions & 5 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,14 @@
from pandas.core.dtypes.common import (
TD64NS_DTYPE,
is_dtype_equal,
is_extension_array_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
is_timedelta64_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
Expand Down Expand Up @@ -137,7 +136,7 @@ class TimedeltaArray(dtl.TimelikeOps):
_typ = "timedeltaarray"
_internal_fill_value = np.timedelta64("NaT", "ns")
_recognized_scalars = (timedelta, np.timedelta64, Tick)
_is_recognized_dtype = is_timedelta64_dtype
_is_recognized_dtype = lambda x: lib.is_np_dtype(x, "m")
_infer_matches = ("timedelta", "timedelta64")

@property
Expand Down Expand Up @@ -912,7 +911,7 @@ def sequence_to_td64ns(
inferred_freq = data.freq

# Convert whatever we have into timedelta64[ns] dtype
if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
if data.dtype == object or is_string_dtype(data.dtype):
# no need to make a copy, need to convert if string-dtyped
data = _objects_to_td64ns(data, unit=unit, errors=errors)
copy = False
Expand All @@ -925,7 +924,7 @@ def sequence_to_td64ns(
elif is_float_dtype(data.dtype):
# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
if is_extension_array_dtype(data.dtype):
if isinstance(data.dtype, ExtensionDtype):
mask = data._mask
data = data._data
else:
Expand Down
13 changes: 5 additions & 8 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@
is_extension_array_dtype,
is_float,
is_integer,
is_integer_dtype,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
Expand Down Expand Up @@ -472,7 +470,7 @@ def maybe_cast_pointwise_result(
else:
result = maybe_cast_to_extension_array(cls, result)

elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result
Expand Down Expand Up @@ -1041,13 +1039,13 @@ def convert_dtypes(
if convert_integer:
target_int_dtype = pandas_dtype_func("Int64")

if is_integer_dtype(input_array.dtype):
if input_array.dtype.kind in "iu":
from pandas.core.arrays.integer import INT_STR_TO_DTYPE

inferred_dtype = INT_STR_TO_DTYPE.get(
input_array.dtype.name, target_int_dtype
)
elif is_numeric_dtype(input_array.dtype):
elif input_array.dtype.kind in "fcb":
# TODO: de-dup with maybe_cast_to_integer_array?
arr = input_array[notna(input_array)]
if (arr.astype(int) == arr).all():
Expand All @@ -1062,9 +1060,8 @@ def convert_dtypes(
inferred_dtype = target_int_dtype

if convert_floating:
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
input_array.dtype
):
if input_array.dtype.kind in "fcb":
# i.e. numeric but not integer
from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(
Expand Down
13 changes: 6 additions & 7 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def classes(*klasses) -> Callable:
return lambda tipo: issubclass(tipo, klasses)


def classes_and_not_datetimelike(*klasses) -> Callable:
def _classes_and_not_datetimelike(*klasses) -> Callable:
"""
Evaluate if the tipo is a subclass of the klasses
and not a datetimelike.
Expand Down Expand Up @@ -654,7 +654,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
False
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.integer)
arr_or_dtype, _classes_and_not_datetimelike(np.integer)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "iu"
)
Expand Down Expand Up @@ -713,7 +713,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
False
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)
arr_or_dtype, _classes_and_not_datetimelike(np.signedinteger)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "i"
)
Expand Down Expand Up @@ -763,7 +763,7 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool:
True
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)
arr_or_dtype, _classes_and_not_datetimelike(np.unsignedinteger)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "u"
)
Expand Down Expand Up @@ -1087,7 +1087,7 @@ def is_numeric_dtype(arr_or_dtype) -> bool:
False
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)
arr_or_dtype, _classes_and_not_datetimelike(np.number, np.bool_)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ._is_numeric
)
Expand Down Expand Up @@ -1490,7 +1490,7 @@ def infer_dtype_from_object(dtype) -> type:
except TypeError:
pass

if is_extension_array_dtype(dtype):
if isinstance(dtype, ExtensionDtype):
return dtype.type
elif isinstance(dtype, str):
# TODO(jreback)
Expand Down Expand Up @@ -1644,7 +1644,6 @@ def is_all_strings(value: ArrayLike) -> bool:

__all__ = [
"classes",
"classes_and_not_datetimelike",
"DT64NS_DTYPE",
"ensure_float64",
"ensure_python_int",
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6138,7 +6138,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
elif is_numeric_dtype(self.dtype):
return is_numeric_dtype(dtype)
# TODO: this was written assuming we only get here with object-dtype,
# which is nom longer correct. Can we specialize for EA?
# which is no longer correct. Can we specialize for EA?
return True

@final
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
ensure_platform_int,
is_datetime64_ns_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_integer,
is_numeric_dtype,
is_object_dtype,
Expand Down Expand Up @@ -1125,7 +1124,7 @@ def as_array(
dtype = dtype.subtype
elif isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype
elif is_extension_array_dtype(dtype):
elif isinstance(dtype, ExtensionDtype):
dtype = "object"
elif is_dtype_equal(dtype, str):
dtype = "object"
Expand Down
Loading