Skip to content

PERF: better perf on _ensure_data in core/algorithms, helping perf of unique, duplicated, factorize #16046

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 34 additions & 37 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pandas.core.dtypes.common import (
is_unsigned_integer_dtype, is_signed_integer_dtype,
is_integer_dtype, is_complex_dtype,
is_object_dtype,
is_categorical_dtype, is_sparse,
is_period_dtype,
is_numeric_dtype, is_float_dtype,
Expand Down Expand Up @@ -63,6 +64,35 @@ def _ensure_data(values, dtype=None):

"""

# we check some simple dtypes first
try:
if is_bool_dtype(values) or is_bool_dtype(dtype):
# we are actually coercing to uint64
# until our algos support uint8 directly (see TODO)
return np.asarray(values).astype('uint64'), 'bool', 'uint64'
elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
return _ensure_int64(values), 'int64', 'int64'
elif (is_unsigned_integer_dtype(values) or
is_unsigned_integer_dtype(dtype)):
return _ensure_uint64(values), 'uint64', 'uint64'
elif is_float_dtype(values) or is_float_dtype(dtype):
return _ensure_float64(values), 'float64', 'float64'
elif is_object_dtype(values) and dtype is None:
return _ensure_object(np.asarray(values)), 'object', 'object'
elif is_complex_dtype(values) or is_complex_dtype(dtype):

# ignore the fact that we are casting to float
# which discards complex parts
with catch_warnings(record=True):
values = _ensure_float64(values)
return values, 'float64', 'float64'

except (TypeError, ValueError):
# if we are trying to coerce to a dtype
# and it is incompat this will fall thru to here
return _ensure_object(values), 'object', 'object'

# datetimelike
if (needs_i8_conversion(values) or
is_period_dtype(dtype) or
is_datetime64_any_dtype(dtype) or
Expand Down Expand Up @@ -94,43 +124,9 @@ def _ensure_data(values, dtype=None):

return values, dtype, 'int64'

# we have failed, return object
values = np.asarray(values)

try:
if is_bool_dtype(values) or is_bool_dtype(dtype):
# we are actually coercing to uint64
# until our algos support uint8 directly (see TODO)
values = values.astype('uint64')
dtype = 'bool'
ndtype = 'uint64'
elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
values = _ensure_int64(values)
ndtype = dtype = 'int64'
elif (is_unsigned_integer_dtype(values) or
is_unsigned_integer_dtype(dtype)):
values = _ensure_uint64(values)
ndtype = dtype = 'uint64'
elif is_complex_dtype(values) or is_complex_dtype(dtype):

# ignore the fact that we are casting to float
# which discards complex parts
with catch_warnings(record=True):
values = _ensure_float64(values)
ndtype = dtype = 'float64'
elif is_float_dtype(values) or is_float_dtype(dtype):
values = _ensure_float64(values)
ndtype = dtype = 'float64'
else:
values = _ensure_object(values)
ndtype = dtype = 'object'

except (TypeError, ValueError):
# if we are trying to coerce to a dtype
# and it is incompat this will fall thru to here
values = _ensure_object(values)
ndtype = dtype = 'object'

return values, dtype, ndtype
return _ensure_object(values), 'object', 'object'


def _reconstruct_data(values, dtype, original):
Expand Down Expand Up @@ -465,7 +461,7 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
if not is_list_like(values):
raise TypeError("Only list-like objects are allowed to be passed to"
"safe_sort as values")
values = np.array(values, copy=False)
values = np.asarray(values)

def sort_mixed(values):
# order ints before strings, safe in py3
Expand Down Expand Up @@ -547,6 +543,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
PeriodIndex
"""

values = _ensure_arraylike(values)
original = values
values, dtype, _ = _ensure_data(values)
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
Expand Down
50 changes: 50 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ def _ensure_categorical(arr):


def is_object_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of the object dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of ``np.object_``.
    """
    if arr_or_dtype is not None:
        scalar_type = _get_dtype_type(arr_or_dtype)
        return issubclass(scalar_type, np.object_)

    # None carries no dtype information, so it never matches
    return False

Expand Down Expand Up @@ -120,6 +122,8 @@ def is_period(array):


def is_datetime64_dtype(arr_or_dtype):
if arr_or_dtype is None:
return False
try:
tipo = _get_dtype_type(arr_or_dtype)
except TypeError:
Expand All @@ -128,23 +132,33 @@ def is_datetime64_dtype(arr_or_dtype):


def is_datetime64tz_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is a ``DatetimeTZDtype``.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    False when ``arr_or_dtype`` is None; otherwise the result of
    ``DatetimeTZDtype.is_dtype(arr_or_dtype)``.
    """
    if arr_or_dtype is not None:
        return DatetimeTZDtype.is_dtype(arr_or_dtype)

    # None short-circuits before the dtype class is consulted
    return False


def is_timedelta64_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a timedelta64 dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of ``np.timedelta64``.
    """
    if arr_or_dtype is not None:
        scalar_type = _get_dtype_type(arr_or_dtype)
        return issubclass(scalar_type, np.timedelta64)

    # None carries no dtype information, so it never matches
    return False


def is_period_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is a ``PeriodDtype``.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    False when ``arr_or_dtype`` is None; otherwise the result of
    ``PeriodDtype.is_dtype(arr_or_dtype)``.
    """
    if arr_or_dtype is not None:
        return PeriodDtype.is_dtype(arr_or_dtype)

    # None short-circuits before the dtype class is consulted
    return False


def is_interval_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is an ``IntervalDtype``.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    False when ``arr_or_dtype`` is None; otherwise the result of
    ``IntervalDtype.is_dtype(arr_or_dtype)``.
    """
    if arr_or_dtype is not None:
        return IntervalDtype.is_dtype(arr_or_dtype)

    # None short-circuits before the dtype class is consulted
    return False


def is_categorical_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is a ``CategoricalDtype``.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    False when ``arr_or_dtype`` is None; otherwise the result of
    ``CategoricalDtype.is_dtype(arr_or_dtype)``.
    """
    if arr_or_dtype is not None:
        return CategoricalDtype.is_dtype(arr_or_dtype)

    # None short-circuits before the dtype class is consulted
    return False


Expand Down Expand Up @@ -178,6 +192,8 @@ def is_string_dtype(arr_or_dtype):

# TODO: gh-15585: consider making the checks stricter.

if arr_or_dtype is None:
return False
try:
dtype = _get_dtype(arr_or_dtype)
return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype)
Expand Down Expand Up @@ -224,45 +240,61 @@ def is_dtype_equal(source, target):


def is_any_int_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype resolves to an integer type.

    Unlike ``is_integer_dtype``, this check applies no datetime64 /
    timedelta64 exclusion.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of ``np.integer``.
    """
    if arr_or_dtype is not None:
        scalar_type = _get_dtype_type(arr_or_dtype)
        return issubclass(scalar_type, np.integer)

    # None carries no dtype information, so it never matches
    return False


def is_integer_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of an integer dtype.

    Types that subclass ``np.datetime64`` or ``np.timedelta64`` are
    explicitly rejected.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None or the resolved type is
        datetime-like; otherwise whether it subclasses ``np.integer``.
    """
    if arr_or_dtype is None:
        return False

    scalar_type = _get_dtype_type(arr_or_dtype)
    is_datetimelike = issubclass(scalar_type,
                                 (np.datetime64, np.timedelta64))
    if is_datetimelike:
        return False
    return issubclass(scalar_type, np.integer)


def is_signed_integer_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a signed integer dtype.

    Types that subclass ``np.datetime64`` or ``np.timedelta64`` are
    explicitly rejected.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None or the resolved type is
        datetime-like; otherwise whether it subclasses
        ``np.signedinteger``.
    """
    if arr_or_dtype is None:
        return False

    scalar_type = _get_dtype_type(arr_or_dtype)
    is_datetimelike = issubclass(scalar_type,
                                 (np.datetime64, np.timedelta64))
    if is_datetimelike:
        return False
    return issubclass(scalar_type, np.signedinteger)


def is_unsigned_integer_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of an unsigned integer
    dtype.

    Types that subclass ``np.datetime64`` or ``np.timedelta64`` are
    explicitly rejected.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None or the resolved type is
        datetime-like; otherwise whether it subclasses
        ``np.unsignedinteger``.
    """
    if arr_or_dtype is None:
        return False

    scalar_type = _get_dtype_type(arr_or_dtype)
    is_datetimelike = issubclass(scalar_type,
                                 (np.datetime64, np.timedelta64))
    if is_datetimelike:
        return False
    return issubclass(scalar_type, np.unsignedinteger)


def is_int64_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of the int64 dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of ``np.int64``.
    """
    if arr_or_dtype is not None:
        scalar_type = _get_dtype_type(arr_or_dtype)
        return issubclass(scalar_type, np.int64)

    # None carries no dtype information, so it never matches
    return False


def is_int_or_datetime_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of an integer,
    datetime64 or timedelta64 dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` subclasses any of ``np.integer``,
        ``np.datetime64`` or ``np.timedelta64``.
    """
    if arr_or_dtype is None:
        return False

    accepted = (np.integer, np.datetime64, np.timedelta64)
    return issubclass(_get_dtype_type(arr_or_dtype), accepted)


def is_datetime64_any_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a datetime64 dtype,
    either timezone-naive or timezone-aware.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    False when ``arr_or_dtype`` is None; otherwise the result of
    ``is_datetime64_dtype(arr_or_dtype) or
    is_datetime64tz_dtype(arr_or_dtype)``.
    """
    if arr_or_dtype is not None:
        # the tz-aware check only runs when the plain check is falsy
        return (is_datetime64_dtype(arr_or_dtype) or
                is_datetime64tz_dtype(arr_or_dtype))

    return False


def is_datetime64_ns_dtype(arr_or_dtype):
if arr_or_dtype is None:
return False
try:
tipo = _get_dtype(arr_or_dtype)
except TypeError:
Expand Down Expand Up @@ -303,6 +335,8 @@ def is_timedelta64_ns_dtype(arr_or_dtype):
False
"""

if arr_or_dtype is None:
return False
try:
tipo = _get_dtype(arr_or_dtype)
return tipo == _TD_DTYPE
Expand All @@ -311,6 +345,8 @@ def is_timedelta64_ns_dtype(arr_or_dtype):


def is_datetime_or_timedelta_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a datetime64 or
    timedelta64 dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` subclasses ``np.datetime64`` or
        ``np.timedelta64``.
    """
    if arr_or_dtype is None:
        return False

    datetimelike = (np.datetime64, np.timedelta64)
    return issubclass(_get_dtype_type(arr_or_dtype), datetimelike)

Expand Down Expand Up @@ -398,12 +434,16 @@ def is_object(x):


def needs_i8_conversion(arr_or_dtype):
    """
    Check whether the provided array or dtype is datetime-like: datetime64
    or timedelta64, timezone-aware datetime64, or period.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    False when ``arr_or_dtype`` is None; otherwise the first truthy result
    among ``is_datetime_or_timedelta_dtype``, ``is_datetime64tz_dtype`` and
    ``is_period_dtype`` (or the last result if none is truthy).
    """
    if arr_or_dtype is None:
        return False

    # checks run in order and stop at the first truthy one
    if is_datetime_or_timedelta_dtype(arr_or_dtype):
        return True
    return (is_datetime64tz_dtype(arr_or_dtype) or
            is_period_dtype(arr_or_dtype))


def is_numeric_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a numeric dtype.

    Booleans count as numeric here; types that subclass ``np.datetime64``
    or ``np.timedelta64`` are explicitly rejected.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None or the resolved type is
        datetime-like; otherwise whether it subclasses ``np.number`` or
        ``np.bool_``.
    """
    if arr_or_dtype is None:
        return False

    scalar_type = _get_dtype_type(arr_or_dtype)
    is_datetimelike = issubclass(scalar_type,
                                 (np.datetime64, np.timedelta64))
    if is_datetimelike:
        return False
    return issubclass(scalar_type, (np.number, np.bool_))
Expand Down Expand Up @@ -438,6 +478,8 @@ def is_string_like_dtype(arr_or_dtype):
False
"""

if arr_or_dtype is None:
return False
try:
dtype = _get_dtype(arr_or_dtype)
return dtype.kind in ('S', 'U')
Expand All @@ -446,16 +488,22 @@ def is_string_like_dtype(arr_or_dtype):


def is_float_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a float dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of ``np.floating``.
    """
    if arr_or_dtype is not None:
        scalar_type = _get_dtype_type(arr_or_dtype)
        return issubclass(scalar_type, np.floating)

    # None carries no dtype information, so it never matches
    return False


def is_floating_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a floating dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of ``np.floating``.
    """
    if arr_or_dtype is None:
        return False
    tipo = _get_dtype_type(arr_or_dtype)
    # ``tipo`` is a type object (every sibling check passes it to
    # ``issubclass``), so ``isinstance(tipo, np.floating)`` could never be
    # True; a subclass test is what was intended.
    return issubclass(tipo, np.floating)


def is_bool_dtype(arr_or_dtype):
if arr_or_dtype is None:
return False
try:
tipo = _get_dtype_type(arr_or_dtype)
except ValueError:
Expand All @@ -479,6 +527,8 @@ def is_extension_type(value):


def is_complex_dtype(arr_or_dtype):
    """
    Check whether the provided array or dtype is of a complex dtype.

    Parameters
    ----------
    arr_or_dtype : array-like, dtype or None
        The value to inspect.

    Returns
    -------
    bool
        False when ``arr_or_dtype`` is None; otherwise whether the type
        resolved by ``_get_dtype_type`` is a subclass of
        ``np.complexfloating``.
    """
    if arr_or_dtype is not None:
        scalar_type = _get_dtype_type(arr_or_dtype)
        return issubclass(scalar_type, np.complexfloating)

    # None carries no dtype information, so it never matches
    return False

Expand Down