From c25cad9f8ebadc3e8c4c77155d48d90bd0e64e3a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 7 Apr 2020 18:21:33 -0700 Subject: [PATCH 1/2] PERF: implement dtype-only dtype checks --- pandas/_libs/__init__.py | 2 ++ pandas/_libs/parsers.pyx | 28 +++++++++++++++------------ pandas/core/dtypes/common.py | 37 +++++++++++++++++++++++++++++++++++- 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index 141ca0645b906..f119e280f5867 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -6,9 +6,11 @@ "Timedelta", "Timestamp", "iNaT", + "Interval", ] +from pandas._libs.interval import Interval from pandas._libs.tslibs import ( NaT, NaTType, diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e4aeb7ad69792..2a41039e25228 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,11 +50,15 @@ from pandas._libs.khash cimport ( kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_integer_dtype, is_float_dtype, - is_bool_dtype, is_object_dtype, - is_datetime64_dtype, - pandas_dtype, is_extension_array_dtype) + is_cat_dtype, + is_integer_dtype, + is_float_dtype, + is_bool_dtype, + is_object_dtype, + is_dt64_dtype, + pandas_dtype, + is_ea_dtype, +) from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom @@ -1064,7 +1068,7 @@ cdef class TextReader: # don't try to upcast EAs try_upcast = upcast_na and na_count > 0 - if try_upcast and not is_extension_array_dtype(col_dtype): + if try_upcast and not is_ea_dtype(col_dtype): col_res = _maybe_upcast(col_res) if col_res is None: @@ -1140,7 +1144,7 @@ cdef class TextReader: bint user_dtype, kh_str_starts_t *na_hashset, object na_flist): - if is_categorical_dtype(dtype): + if is_cat_dtype(dtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( @@ -1153,7 +1157,7 @@ cdef class TextReader: cats, codes, dtype, true_values=true_values) return cat, na_count - elif is_extension_array_dtype(dtype): + elif is_ea_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) array_type = dtype.construct_array_type() @@ -1223,7 +1227,7 @@ cdef class TextReader: elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) - elif is_datetime64_dtype(dtype): + elif is_dt64_dtype(dtype): raise TypeError(f"the dtype {dtype} is not supported " f"for parsing, pass this column " f"using parse_dates instead") @@ -2035,19 +2039,19 @@ def _concatenate_chunks(list chunks): arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. dtypes = {a.dtype for a in arrs} - numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} + numpy_dtypes = {x for x in dtypes if not is_cat_dtype(x)} if len(numpy_dtypes) > 1: common_type = np.find_common_type(numpy_dtypes, []) if common_type == np.object: warning_columns.append(str(name)) dtype = dtypes.pop() - if is_categorical_dtype(dtype): + if is_cat_dtype(dtype): sort_categories = isinstance(dtype, str) result[name] = union_categoricals(arrs, sort_categories=sort_categories) else: - if is_extension_array_dtype(dtype): + if is_ea_dtype(dtype): array_type = dtype.construct_array_type() result[name] = array_type._concat_same_type(arrs) else: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 16373bd697c1f..9b20292fea431 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import algos +from pandas._libs import Interval, Period, algos from pandas._libs.tslibs import conversion from pandas._typing import ArrayLike, DtypeObj @@ -1516,6 +1516,41 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None +def is_ea_dtype(dtype) -> bool: + return isinstance(dtype, ExtensionDtype) + + +def is_dt64_dtype(dtype) -> bool: + return isinstance(dtype, np.dtype) and dtype.kind == "M" + + +def is_dt64tz_dtype(dtype) -> bool: + return isinstance(dtype, ExtensionDtype) and dtype.kind == "M" + + +def is_dt64_any_dtype(dtype) -> bool: + return isinstance(dtype, (np.dtype, ExtensionDtype)) and dtype.kind == "M" + + +def is_td64_dtype(dtype) -> bool: + return isinstance(dtype, np.dtype) and dtype.kind == "m" + + +def is_period_dtype_obj(dtype) -> bool: + return isinstance(dtype, ExtensionDtype) and dtype.type is Period + + +def is_interval_dtype_obj(dtype) -> bool: + return isinstance(dtype, ExtensionDtype) and dtype.type is Interval + + +def is_cat_dtype(dtype) -> bool: + """ + Check if we have a CategoricalDtype object. + """ + return isinstance(dtype, ExtensionDtype) and dtype.name == "category" + + def is_complex_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a complex dtype. From a56482befbc09fc50e5a82927c588a80b9f0de53 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Apr 2020 13:50:44 -0700 Subject: [PATCH 2/2] remove strict versions --- pandas/_libs/parsers.pyx | 28 +++++++--------- pandas/core/dtypes/common.py | 62 ++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2a41039e25228..e4aeb7ad69792 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,15 +50,11 @@ from pandas._libs.khash cimport ( kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) from pandas.core.dtypes.common import ( - is_cat_dtype, - is_integer_dtype, - is_float_dtype, - is_bool_dtype, - is_object_dtype, - is_dt64_dtype, - pandas_dtype, - is_ea_dtype, -) + is_categorical_dtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_datetime64_dtype, + pandas_dtype, is_extension_array_dtype) from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom @@ -1068,7 +1064,7 @@ cdef class TextReader: # don't try to upcast EAs try_upcast = upcast_na and na_count > 0 - if try_upcast and not is_ea_dtype(col_dtype): + if try_upcast and not is_extension_array_dtype(col_dtype): col_res = _maybe_upcast(col_res) if col_res is None: @@ -1144,7 +1140,7 @@ cdef class TextReader: bint user_dtype, kh_str_starts_t *na_hashset, object na_flist): - if is_cat_dtype(dtype): + if is_categorical_dtype(dtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( @@ -1157,7 +1153,7 @@ cdef class TextReader: cats, codes, dtype, true_values=true_values) return cat, na_count - elif is_ea_dtype(dtype): + elif is_extension_array_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) array_type = dtype.construct_array_type() @@ -1227,7 +1223,7 @@ cdef class TextReader: elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) - elif is_dt64_dtype(dtype): + elif is_datetime64_dtype(dtype): raise TypeError(f"the dtype {dtype} is not supported " f"for parsing, pass this column " f"using parse_dates instead") @@ -2039,19 +2035,19 @@ def _concatenate_chunks(list chunks): arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. dtypes = {a.dtype for a in arrs} - numpy_dtypes = {x for x in dtypes if not is_cat_dtype(x)} + numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} if len(numpy_dtypes) > 1: common_type = np.find_common_type(numpy_dtypes, []) if common_type == np.object: warning_columns.append(str(name)) dtype = dtypes.pop() - if is_cat_dtype(dtype): + if is_categorical_dtype(dtype): sort_categories = isinstance(dtype, str) result[name] = union_categoricals(arrs, sort_categories=sort_categories) else: - if is_ea_dtype(dtype): + if is_extension_array_dtype(dtype): array_type = dtype.construct_array_type() result[name] = array_type._concat_same_type(arrs) else: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a31837838144b..76aefd9d5ba8f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -396,6 +396,9 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: >>> is_datetime64_dtype([1, 2, 3]) False """ + if isinstance(arr_or_dtype, np.dtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "M" return _is_dtype_type(arr_or_dtype, classes(np.datetime64)) @@ -431,6 +434,10 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: >>> is_datetime64tz_dtype(s) True """ + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "M" + if arr_or_dtype is None: return False return DatetimeTZDtype.is_dtype(arr_or_dtype) @@ -463,6 +470,10 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool: >>> is_timedelta64_dtype('0 days') False """ + if isinstance(arr_or_dtype, np.dtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "m" + return _is_dtype_type(arr_or_dtype, classes(np.timedelta64)) @@ -493,6 +504,10 @@ def is_period_dtype(arr_or_dtype) -> bool: >>> is_period_dtype(pd.PeriodIndex([], freq="A")) True """ + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.type is Period + # TODO: Consider making Period an instance of PeriodDtype if arr_or_dtype is None: return False @@ -528,6 +543,10 @@ def is_interval_dtype(arr_or_dtype) -> bool: >>> is_interval_dtype(pd.IntervalIndex([interval])) True """ + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.type is Interval + # TODO: Consider making Interval an instance of IntervalDtype if arr_or_dtype is None: return False @@ -561,6 +580,10 @@ def is_categorical_dtype(arr_or_dtype) -> bool: >>> is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) True """ + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.name == "category" + if arr_or_dtype is None: return False return CategoricalDtype.is_dtype(arr_or_dtype) @@ -938,6 +961,10 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: >>> is_datetime64_any_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) True """ + if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "M" + if arr_or_dtype is None: return False return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) @@ -1505,41 +1532,6 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None -def is_ea_dtype(dtype) -> bool: - return isinstance(dtype, ExtensionDtype) - - -def is_dt64_dtype(dtype) -> bool: - return isinstance(dtype, np.dtype) and dtype.kind == "M" - - -def is_dt64tz_dtype(dtype) -> bool: - return isinstance(dtype, ExtensionDtype) and dtype.kind == "M" - - -def is_dt64_any_dtype(dtype) -> bool: - return isinstance(dtype, (np.dtype, ExtensionDtype)) and dtype.kind == "M" - - -def is_td64_dtype(dtype) -> bool: - return isinstance(dtype, np.dtype) and dtype.kind == "m" - - -def is_period_dtype_obj(dtype) -> bool: - return isinstance(dtype, ExtensionDtype) and dtype.type is Period - - -def is_interval_dtype_obj(dtype) -> bool: - return isinstance(dtype, ExtensionDtype) and dtype.type is Interval - - -def is_cat_dtype(dtype) -> bool: - """ - Check if we have a CategoricalDtype object. - """ - return isinstance(dtype, ExtensionDtype) and dtype.name == "category" - - def is_complex_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a complex dtype.