diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 63df4b3d94bc8..8437861bea19e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, + is_object_dtype, is_categorical_dtype, is_sparse, is_period_dtype, is_numeric_dtype, is_float_dtype, @@ -63,6 +64,35 @@ def _ensure_data(values, dtype=None): """ + # we check some simple dtypes first + try: + if is_bool_dtype(values) or is_bool_dtype(dtype): + # we are actually coercing to uint64 + # until our algos support uint8 directly (see TODO) + return np.asarray(values).astype('uint64'), 'bool', 'uint64' + elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): + return _ensure_int64(values), 'int64', 'int64' + elif (is_unsigned_integer_dtype(values) or + is_unsigned_integer_dtype(dtype)): + return _ensure_uint64(values), 'uint64', 'uint64' + elif is_float_dtype(values) or is_float_dtype(dtype): + return _ensure_float64(values), 'float64', 'float64' + elif is_object_dtype(values) and dtype is None: + return _ensure_object(np.asarray(values)), 'object', 'object' + elif is_complex_dtype(values) or is_complex_dtype(dtype): + + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(record=True): + values = _ensure_float64(values) + return values, 'float64', 'float64' + + except (TypeError, ValueError): + # if we are trying to coerce to a dtype + # and it is incompat this will fall thru to here + return _ensure_object(values), 'object', 'object' + + # datetimelike if (needs_i8_conversion(values) or is_period_dtype(dtype) or is_datetime64_any_dtype(dtype) or @@ -94,43 +124,9 @@ def _ensure_data(values, dtype=None): return values, dtype, 'int64' + # we have failed, return object values = np.asarray(values) - - try: - if is_bool_dtype(values) or is_bool_dtype(dtype): - # we are actually 
coercing to uint64 - # until our algos suppport uint8 directly (see TODO) - values = values.astype('uint64') - dtype = 'bool' - ndtype = 'uint64' - elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): - values = _ensure_int64(values) - ndtype = dtype = 'int64' - elif (is_unsigned_integer_dtype(values) or - is_unsigned_integer_dtype(dtype)): - values = _ensure_uint64(values) - ndtype = dtype = 'uint64' - elif is_complex_dtype(values) or is_complex_dtype(dtype): - - # ignore the fact that we are casting to float - # which discards complex parts - with catch_warnings(record=True): - values = _ensure_float64(values) - ndtype = dtype = 'float64' - elif is_float_dtype(values) or is_float_dtype(dtype): - values = _ensure_float64(values) - ndtype = dtype = 'float64' - else: - values = _ensure_object(values) - ndtype = dtype = 'object' - - except (TypeError, ValueError): - # if we are trying to coerce to a dtype - # and it is incompat this will fall thru to here - values = _ensure_object(values) - ndtype = dtype = 'object' - - return values, dtype, ndtype + return _ensure_object(values), 'object', 'object' def _reconstruct_data(values, dtype, original): @@ -465,7 +461,7 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") - values = np.array(values, copy=False) + values = np.asarray(values) def sort_mixed(values): # order ints before strings, safe in py3 @@ -547,6 +543,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): PeriodIndex """ + values = _ensure_arraylike(values) original = values values, dtype, _ = _ensure_data(values) (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0b14e484d40a7..156e43fc4e5fb 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ 
-82,6 +82,8 @@ def _ensure_categorical(arr): def is_object_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.object_) @@ -120,6 +122,8 @@ def is_period(array): def is_datetime64_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False try: tipo = _get_dtype_type(arr_or_dtype) except TypeError: @@ -128,23 +132,33 @@ def is_datetime64_dtype(arr_or_dtype): def is_datetime64tz_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False return DatetimeTZDtype.is_dtype(arr_or_dtype) def is_timedelta64_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.timedelta64) def is_period_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False return PeriodDtype.is_dtype(arr_or_dtype) def is_interval_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False return IntervalDtype.is_dtype(arr_or_dtype) def is_categorical_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False return CategoricalDtype.is_dtype(arr_or_dtype) @@ -178,6 +192,8 @@ def is_string_dtype(arr_or_dtype): # TODO: gh-15585: consider making the checks stricter. 
+ if arr_or_dtype is None: + return False try: dtype = _get_dtype(arr_or_dtype) return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) @@ -224,45 +240,61 @@ def is_dtype_equal(source, target): def is_any_int_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.integer) def is_integer_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) def is_signed_integer_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.signedinteger) and not issubclass(tipo, (np.datetime64, np.timedelta64))) def is_unsigned_integer_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.unsignedinteger) and not issubclass(tipo, (np.datetime64, np.timedelta64))) def is_int64_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.int64) def is_int_or_datetime_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or issubclass(tipo, (np.datetime64, np.timedelta64))) def is_datetime64_any_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False return (is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype)) def is_datetime64_ns_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False try: tipo = _get_dtype(arr_or_dtype) except TypeError: @@ -303,6 +335,8 @@ def is_timedelta64_ns_dtype(arr_or_dtype): False """ + if arr_or_dtype is None: + return False try: tipo = _get_dtype(arr_or_dtype) return tipo == _TD_DTYPE @@ -311,6 +345,8 @@ def is_timedelta64_ns_dtype(arr_or_dtype): def is_datetime_or_timedelta_dtype(arr_or_dtype): + if arr_or_dtype is 
None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, (np.datetime64, np.timedelta64)) @@ -398,12 +434,16 @@ def is_object(x): def needs_i8_conversion(arr_or_dtype): + if arr_or_dtype is None: + return False return (is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) or is_period_dtype(arr_or_dtype)) def is_numeric_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, (np.number, np.bool_)) and not issubclass(tipo, (np.datetime64, np.timedelta64))) @@ -438,6 +478,8 @@ def is_string_like_dtype(arr_or_dtype): False """ + if arr_or_dtype is None: + return False try: dtype = _get_dtype(arr_or_dtype) return dtype.kind in ('S', 'U') @@ -446,16 +488,22 @@ def is_string_like_dtype(arr_or_dtype): def is_float_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.floating) def is_floating_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return isinstance(tipo, np.floating) def is_bool_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False try: tipo = _get_dtype_type(arr_or_dtype) except ValueError: @@ -479,6 +527,8 @@ def is_extension_type(value): def is_complex_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.complexfloating)