diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 4f7cc9345ed30..fdeff2ed11805 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -1,7 +1,7 @@ -from pandas._libs.util cimport numeric +from pandas._libs.dtypes cimport numeric_t -cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil +cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil cdef enum TiebreakEnumType: TIEBREAK_AVERAGE diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 22e2abc9b9c36..82f9280870d59 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -45,7 +45,11 @@ from numpy cimport ( cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.dtypes cimport numeric_object_t +from pandas._libs.dtypes cimport ( + iu_64_floating_obj_t, + numeric_object_t, + numeric_t, +) from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -55,10 +59,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) -from pandas._libs.util cimport ( - get_nat, - numeric, -) +from pandas._libs.util cimport get_nat import pandas._libs.missing as missing @@ -240,9 +241,9 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): return indexer.base, counts.base -cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: +cdef inline Py_ssize_t swap(numeric_t *a, numeric_t *b) nogil: cdef: - numeric t + numeric_t t # cython doesn't allow pointer dereference so use array syntax t = a[0] @@ -251,7 +252,7 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: return 0 -cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil: +cdef inline numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil: """ See kth_smallest.__doc__. The additional parameter n specifies the maximum number of elements considered in arr, needed for compatibility with usage @@ -259,7 +260,7 @@ cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nog """ cdef: Py_ssize_t i, j, l, m - numeric x + numeric_t x l = 0 m = n - 1 @@ -291,7 +292,7 @@ cdef inline numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nog @cython.boundscheck(False) @cython.wraparound(False) -def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric: +def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t: """ Compute the kth smallest value in arr. Note that the input array will be modified. @@ -309,7 +310,7 @@ def kth_smallest(numeric[::1] arr, Py_ssize_t k) -> numeric: The kth smallest value in arr """ cdef: - numeric result + numeric_t result with nogil: result = kth_smallest_c(&arr[0], k, arr.shape[0]) @@ -514,20 +515,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr # ---------------------------------------------------------------------- -ctypedef fused algos_t: - float64_t - float32_t - object - int64_t - int32_t - int16_t - int8_t - uint64_t - uint32_t - uint16_t - uint8_t - - def validate_limit(nobs: int | None, limit=None) -> int: """ Check that the `limit` argument is a positive integer. @@ -556,12 +543,16 @@ def validate_limit(nobs: int | None, limit=None) -> int: @cython.boundscheck(False) @cython.wraparound(False) -def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +def pad( + ndarray[numeric_object_t] old, + ndarray[numeric_object_t] new, + limit=None +) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer - algos_t cur, next_val + numeric_object_t cur, next_val int lim, fill_count = 0 nleft = len(old) @@ -614,10 +605,10 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): +def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N - algos_t val + numeric_object_t val uint8_t prev_mask int lim, fill_count = 0 @@ -646,10 +637,10 @@ def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): +def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K - algos_t val + numeric_object_t val int lim, fill_count = 0 K, N = (values).shape @@ -702,12 +693,16 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +def backfill( + ndarray[numeric_object_t] old, + ndarray[numeric_object_t] new, + limit=None +) -> ndarray: # -> ndarray[intp_t, ndim=1] cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer - algos_t cur, prev + numeric_object_t cur, prev int lim, fill_count = 0 nleft = len(old) @@ -759,11 +754,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: return indexer -def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): +def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None): pad_inplace(values[::-1], mask[::-1], limit=limit) -def backfill_2d_inplace(algos_t[:, :] values, +def backfill_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None): pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit) @@ -771,7 +766,7 @@ def backfill_2d_inplace(algos_t[:, :] values, @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): +def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): """ Returns ------- @@ -782,7 +777,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): """ cdef: Py_ssize_t i, n - algos_t prev, cur + numeric_object_t prev, cur bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 bint is_unique = 1 @@ -802,7 +797,7 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): if timelike and arr[0] == NPY_NAT: return False, False, True - if algos_t is not object: + if numeric_object_t is not object: with nogil: prev = arr[0] for i in range(1, n): @@ -861,9 +856,9 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): # rank_1d, rank_2d # ---------------------------------------------------------------------- -cdef numeric_object_t get_rank_nan_fill_val( +cdef iu_64_floating_obj_t get_rank_nan_fill_val( bint rank_nans_highest, - numeric_object_t[:] _=None + iu_64_floating_obj_t[:] _=None ): """ Return the value we'll use to represent missing values when sorting depending @@ -871,20 +866,20 @@ cdef numeric_object_t get_rank_nan_fill_val( is unused, but needed for fused type specialization) """ if rank_nans_highest: - if numeric_object_t is object: + if iu_64_floating_obj_t is object: return Infinity() - elif numeric_object_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return util.INT64_MAX - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: return util.UINT64_MAX else: return np.inf else: - if numeric_object_t is object: + if iu_64_floating_obj_t is object: return NegInfinity() - elif numeric_object_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: return 0 else: return -np.inf @@ -893,7 +888,7 @@ cdef numeric_object_t get_rank_nan_fill_val( @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - ndarray[numeric_object_t, ndim=1] values, + ndarray[iu_64_floating_obj_t, ndim=1] values, const intp_t[:] labels=None, bint is_datetimelike=False, ties_method="average", @@ -906,7 +901,7 @@ def rank_1d( Parameters ---------- - values : array of numeric_object_t values to be ranked + values : array of iu_64_floating_obj_t values to be ranked labels : np.ndarray[np.intp] or None Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. If not called @@ -936,11 +931,11 @@ def rank_1d( int64_t[::1] grp_sizes intp_t[:] lexsort_indexer float64_t[::1] out - ndarray[numeric_object_t, ndim=1] masked_vals - numeric_object_t[:] masked_vals_memview + ndarray[iu_64_floating_obj_t, ndim=1] masked_vals + iu_64_floating_obj_t[:] masked_vals_memview uint8_t[:] mask bint keep_na, nans_rank_highest, check_labels, check_mask - numeric_object_t nan_fill_val + iu_64_floating_obj_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -961,22 +956,22 @@ def rank_1d( check_labels = labels is not None # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (numeric_object_t is uint64_t or - (numeric_object_t is int64_t and not is_datetimelike)) + check_mask = not (iu_64_floating_obj_t is uint64_t or + (iu_64_floating_obj_t is int64_t and not is_datetimelike)) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array - if numeric_object_t is object and values.dtype != np.object_: + if iu_64_floating_obj_t is object and values.dtype != np.object_: masked_vals = values.astype('O') else: masked_vals = values.copy() - if numeric_object_t is object: + if iu_64_floating_obj_t is object: mask = missing.isnaobj(masked_vals) - elif numeric_object_t is int64_t and is_datetimelike: + elif iu_64_floating_obj_t is int64_t and is_datetimelike: mask = (masked_vals == NPY_NAT).astype(np.uint8) - elif numeric_object_t is float64_t: + elif iu_64_floating_obj_t is float64_t: mask = np.isnan(masked_vals).astype(np.uint8) else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) @@ -988,7 +983,7 @@ def rank_1d( # will flip the ordering to still end up with lowest rank. # Symmetric logic applies to `na_option == 'bottom'` nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) if nans_rank_highest: order = [masked_vals, mask] else: @@ -1035,7 +1030,7 @@ cdef void rank_sorted_1d( int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, # Can make const with cython3 (https://github.com/cython/cython/issues/3222) - numeric_object_t[:] masked_vals, + iu_64_floating_obj_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1059,7 +1054,7 @@ cdef void rank_sorted_1d( if labels is None. sort_indexer : intp_t[:] Array of indices which sorts masked_vals - masked_vals : numeric_object_t[:] + masked_vals : iu_64_floating_obj_t[:] The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] Array where entries are True if the value is missing, False otherwise. @@ -1091,7 +1086,7 @@ cdef void rank_sorted_1d( # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO: de-duplicate once cython supports conditional nogil - if numeric_object_t is object: + if iu_64_floating_obj_t is object: with gil: for i in range(N): at_end = i == N - 1 @@ -1299,7 +1294,7 @@ cdef void rank_sorted_1d( def rank_2d( - ndarray[numeric_object_t, ndim=2] in_arr, + ndarray[iu_64_floating_obj_t, ndim=2] in_arr, int axis=0, bint is_datetimelike=False, ties_method="average", @@ -1314,13 +1309,13 @@ def rank_2d( Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous int64_t[::1] grp_sizes - ndarray[numeric_object_t, ndim=2] values - numeric_object_t[:, :] masked_vals + ndarray[iu_64_floating_obj_t, ndim=2] values + iu_64_floating_obj_t[:, :] masked_vals intp_t[:, :] sort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest - numeric_object_t nan_fill_val + iu_64_floating_obj_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -1330,25 +1325,25 @@ def rank_2d( keep_na = na_option == 'keep' # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (numeric_object_t is uint64_t or - (numeric_object_t is int64_t and not is_datetimelike)) + check_mask = not (iu_64_floating_obj_t is uint64_t or + (iu_64_floating_obj_t is int64_t and not is_datetimelike)) if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() - if numeric_object_t is object: + if iu_64_floating_obj_t is object: if values.dtype != np.object_: values = values.astype('O') nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) - if numeric_object_t is object: + if iu_64_floating_obj_t is object: mask = missing.isnaobj2d(values).view(np.uint8) - elif numeric_object_t is float64_t: + elif iu_64_floating_obj_t is float64_t: mask = np.isnan(values).view(np.uint8) # int64 and datetimelike diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index ef95b8aab6e70..f87a1525b15fd 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -5,13 +5,44 @@ Common location for shared fused types from numpy cimport ( float32_t, float64_t, + int8_t, + int16_t, + int32_t, int64_t, + uint8_t, + uint16_t, + uint32_t, uint64_t, ) +# All numeric types except complex +ctypedef fused numeric_t: + int8_t + int16_t + int32_t + int64_t + + uint8_t + uint16_t + uint32_t + uint64_t + + float32_t + float64_t + +# All numeric types + object, doesn't include complex ctypedef fused numeric_object_t: + numeric_t + object + +# i64 + u64 + all float types +ctypedef fused iu_64_floating_t: float64_t float32_t int64_t uint64_t + +# i64 + u64 + all float types + object +ctypedef fused iu_64_floating_obj_t: + iu_64_floating_t object diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6dfed95e7afb6..1e05ef443d516 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -31,10 +31,7 @@ from numpy.math cimport NAN cnp.import_array() from pandas._libs.algos cimport kth_smallest_c -from pandas._libs.util cimport ( - get_nat, - numeric, -) +from pandas._libs.util cimport get_nat from pandas._libs.algos import ( ensure_platform_int, @@ -43,7 +40,11 @@ from pandas._libs.algos import ( take_2d_axis1_float64_float64, ) -from pandas._libs.dtypes cimport numeric_object_t +from pandas._libs.dtypes cimport ( + iu_64_floating_obj_t, + iu_64_floating_t, + numeric_t, +) from pandas._libs.missing cimport checknull @@ -201,8 +202,8 @@ def group_cumprod_float64(float64_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumsum(numeric[:, ::1] out, - ndarray[numeric, ndim=2] values, +def group_cumsum(numeric_t[:, ::1] out, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, int ngroups, is_datetimelike, @@ -231,8 +232,8 @@ def group_cumsum(numeric[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, size - numeric val, y, t - numeric[:, ::1] accum, compensation + numeric_t val, y, t + numeric_t[:, ::1] accum, compensation intp_t lab N, K = (values).shape @@ -250,7 +251,7 @@ def group_cumsum(numeric[:, ::1] out, # For floats, use Kahan summation to reduce floating-point # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) - if numeric == float32_t or numeric == float64_t: + if numeric_t == float32_t or numeric_t == float64_t: if val == val: y = val - compensation[lab, j] t = accum[lab, j] + y @@ -812,7 +813,7 @@ def group_ohlc(floating[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t, ndim=2] out, - ndarray[numeric, ndim=1] values, + ndarray[numeric_t, ndim=1] values, ndarray[intp_t] labels, ndarray[uint8_t] mask, const intp_t[:] sort_indexer, @@ -928,15 +929,15 @@ def group_quantile(ndarray[float64_t, ndim=2] out, # group_nth, group_last, group_rank # ---------------------------------------------------------------------- -cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil: - if numeric_object_t is object: +cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil: + if iu_64_floating_obj_t is object: # Should never be used, but we need to avoid the `val != val` below # or else cython will raise about gil acquisition. raise NotImplementedError - elif numeric_object_t is int64_t: + elif iu_64_floating_obj_t is int64_t: return is_datetimelike and val == NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: # There is no NA value for uint64 return False else: @@ -944,12 +945,12 @@ cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil: # GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` +# use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_last(numeric_object_t[:, ::1] out, +def group_last(iu_64_floating_obj_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1) -> None: """ @@ -957,8 +958,8 @@ def group_last(numeric_object_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - numeric_object_t val - ndarray[numeric_object_t, ndim=2] resx + iu_64_floating_obj_t val + ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs bint runtime_error = False @@ -969,14 +970,14 @@ def group_last(numeric_object_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - if numeric_object_t is object: + if iu_64_floating_obj_t is object: resx = np.empty((out).shape, dtype=object) else: resx = np.empty_like(out) N, K = (values).shape - if numeric_object_t is object: + if iu_64_floating_obj_t is object: # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] @@ -1018,9 +1019,9 @@ def group_last(numeric_object_t[:, ::1] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if numeric_object_t is int64_t: + if iu_64_floating_obj_t is int64_t: out[i, j] = NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: runtime_error = True break else: @@ -1036,12 +1037,12 @@ def group_last(numeric_object_t[:, ::1] out, # GH#31710 use memorviews once cython 0.30 is released so we can -# use `const numeric_object_t[:, :] values` +# use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_nth(numeric_object_t[:, ::1] out, +def group_nth(iu_64_floating_obj_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_object_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, int64_t min_count=-1, int64_t rank=1, @@ -1051,8 +1052,8 @@ def group_nth(numeric_object_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - numeric_object_t val - ndarray[numeric_object_t, ndim=2] resx + iu_64_floating_obj_t val + ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs bint runtime_error = False @@ -1063,14 +1064,14 @@ def group_nth(numeric_object_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - if numeric_object_t is object: + if iu_64_floating_obj_t is object: resx = np.empty((out).shape, dtype=object) else: resx = np.empty_like(out) N, K = (values).shape - if numeric_object_t is object: + if iu_64_floating_obj_t is object: # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] @@ -1115,9 +1116,9 @@ def group_nth(numeric_object_t[:, ::1] out, for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if numeric_object_t is int64_t: + if iu_64_floating_obj_t is int64_t: out[i, j] = NPY_NAT - elif numeric_object_t is uint64_t: + elif iu_64_floating_obj_t is uint64_t: runtime_error = True break else: @@ -1134,7 +1135,7 @@ def group_nth(numeric_object_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, ::1] out, - ndarray[numeric_object_t, ndim=2] values, + ndarray[iu_64_floating_obj_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, str ties_method="average", @@ -1146,7 +1147,7 @@ def group_rank(float64_t[:, ::1] out, ---------- out : np.ndarray[np.float64, ndim=2] Values to which this method will write its results. - values : np.ndarray of numeric_object_t values to be ranked + values : np.ndarray of iu_64_floating_obj_t values to be ranked labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values` @@ -1201,18 +1202,12 @@ def group_rank(float64_t[:, ::1] out, # ---------------------------------------------------------------------- # TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - @cython.wraparound(False) @cython.boundscheck(False) -cdef group_min_max(groupby_t[:, ::1] out, +cdef group_min_max(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1224,7 +1219,7 @@ cdef group_min_max(groupby_t[:, ::1] out, Parameters ---------- - out : np.ndarray[groupby_t, ndim=2] + out : np.ndarray[iu_64_floating_t, ndim=2] Array to store result in. counts : np.ndarray[int64] Input as a zeroed array, populated by group sizes during algorithm @@ -1253,8 +1248,8 @@ cdef group_min_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - groupby_t val, nan_val - ndarray[groupby_t, ndim=2] group_min_or_max + iu_64_floating_t val, nan_val + ndarray[iu_64_floating_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs bint uses_mask = mask is not None @@ -1269,10 +1264,10 @@ cdef group_min_max(groupby_t[:, ::1] out, nobs = np.zeros((out).shape, dtype=np.int64) group_min_or_max = np.empty_like(out) - if groupby_t is int64_t: + if iu_64_floating_t is int64_t: group_min_or_max[:] = -_int64_max if compute_max else _int64_max nan_val = NPY_NAT - elif groupby_t is uint64_t: + elif iu_64_floating_t is uint64_t: # NB: We do not define nan_val because there is no such thing # for uint64_t. We carefully avoid having to reference it in this # case. @@ -1310,7 +1305,7 @@ cdef group_min_max(groupby_t[:, ::1] out, for i in range(ngroups): for j in range(K): if nobs[i, j] < min_count: - if groupby_t is uint64_t: + if iu_64_floating_t is uint64_t: runtime_error = True break else: @@ -1329,9 +1324,9 @@ cdef group_min_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_max(groupby_t[:, ::1] out, +def group_max(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1353,9 +1348,9 @@ def group_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_min(groupby_t[:, ::1] out, +def group_min(iu_64_floating_t[:, ::1] out, int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1377,8 +1372,8 @@ def group_min(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef group_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef group_cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[::1] labels, int ngroups, @@ -1390,9 +1385,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Parameters ---------- - out : np.ndarray[groupby_t, ndim=2] + out : np.ndarray[iu_64_floating_t, ndim=2] Array to store cummin/max in. - values : np.ndarray[groupby_t, ndim=2] + values : np.ndarray[iu_64_floating_t, ndim=2] Values to take cummin/max of. mask : np.ndarray[bool] or None If not None, indices represent missing values, @@ -1414,12 +1409,12 @@ cdef group_cummin_max(groupby_t[:, ::1] out, This method modifies the `out` parameter, rather than returning an object. """ cdef: - groupby_t[:, ::1] accum + iu_64_floating_t[:, ::1] accum accum = np.empty((ngroups, (values).shape[1]), dtype=values.dtype) - if groupby_t is int64_t: + if iu_64_floating_t is int64_t: accum[:] = -_int64_max if compute_max else _int64_max - elif groupby_t is uint64_t: + elif iu_64_floating_t is uint64_t: accum[:] = 0 if compute_max else np.iinfo(np.uint64).max else: accum[:] = -np.inf if compute_max else np.inf @@ -1432,10 +1427,10 @@ cdef group_cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, - groupby_t[:, ::1] accum, + iu_64_floating_t[:, ::1] accum, bint skipna, bint is_datetimelike, bint compute_max): @@ -1445,12 +1440,12 @@ cdef cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval, na_val + iu_64_floating_t val, mval, na_val uint8_t[:, ::1] seen_na intp_t lab bint na_possible - if groupby_t is float64_t or groupby_t is float32_t: + if iu_64_floating_t is float64_t or iu_64_floating_t is float32_t: na_val = NaN na_possible = True elif is_datetimelike: @@ -1491,11 +1486,11 @@ cdef cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef masked_cummin_max(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +cdef masked_cummin_max(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, uint8_t[:, ::1] mask, const intp_t[::1] labels, - groupby_t[:, ::1] accum, + iu_64_floating_t[:, ::1] accum, bint skipna, bint compute_max): """ @@ -1504,7 +1499,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K - groupby_t val, mval + iu_64_floating_t val, mval uint8_t[:, ::1] seen_na intp_t lab @@ -1535,8 +1530,8 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +def group_cummin(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, @@ -1557,8 +1552,8 @@ def group_cummin(groupby_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax(groupby_t[:, ::1] out, - ndarray[groupby_t, ndim=2] values, +def group_cummax(iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, const intp_t[::1] labels, int ngroups, bint is_datetimelike, diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index b6acf8914c0a6..c9a4b49f90037 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -4,17 +4,9 @@ import numpy as np cimport numpy as cnp from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, int64_t, intp_t, ndarray, - uint8_t, - uint16_t, - uint32_t, uint64_t, ) @@ -22,6 +14,11 @@ cnp.import_array() from pandas._libs.algos import groupsort_indexer +from pandas._libs.dtypes cimport ( + numeric_object_t, + numeric_t, +) + @cython.wraparound(False) @cython.boundscheck(False) @@ -257,31 +254,20 @@ def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: # left_join_indexer, inner_join_indexer, outer_join_indexer # ---------------------------------------------------------------------- -ctypedef fused join_t: - float64_t - float32_t - object - int8_t - int16_t - int32_t - int64_t - uint8_t - uint16_t - uint32_t - uint64_t - - # Joins on ordered, unique indices # right might contain non-unique values @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): +def left_join_indexer_unique( + ndarray[numeric_object_t] left, + ndarray[numeric_object_t] right +): cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t] indexer - join_t lval, rval + numeric_object_t lval, rval i = 0 j = 0 @@ -322,15 +308,15 @@ def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges. """ cdef: Py_ssize_t i, j, k, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -425,15 +411,15 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): """ Two-pass algorithm for monotonic indexes. Handles many-to-one merges. """ cdef: Py_ssize_t i, j, k, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -518,12 +504,12 @@ def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): +def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - join_t lval, rval + numeric_object_t lval, rval ndarray[intp_t] lindexer, rindexer - ndarray[join_t] result + ndarray[numeric_object_t] result nleft = len(left) nright = len(right) @@ -656,26 +642,14 @@ from pandas._libs.hashtable cimport ( UInt64HashTable, ) -ctypedef fused asof_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float - float64_t - ctypedef fused by_t: object int64_t uint64_t -def asof_join_backward_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, @@ -685,8 +659,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 HashTable hash_table by_t by_value @@ -743,8 +717,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_forward_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=1, @@ -754,8 +728,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 HashTable hash_table by_t by_value @@ -812,8 +786,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, + numeric_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, @@ -822,7 +796,7 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_size, right_size, i ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff + numeric_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) @@ -865,8 +839,8 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, # asof_join # ---------------------------------------------------------------------- -def asof_join_backward(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_backward(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): @@ -874,8 +848,8 @@ def asof_join_backward(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -918,8 +892,8 @@ def asof_join_backward(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_forward(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_forward(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): @@ -927,8 +901,8 @@ def asof_join_forward(asof_t[:] left_values, Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False - asof_t tolerance_ = 0 - asof_t diff = 0 + numeric_t tolerance_ = 0 + numeric_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: @@ -972,15 +946,15 @@ def asof_join_forward(asof_t[:] left_values, return left_indexer, right_indexer -def asof_join_nearest(asof_t[:] left_values, - asof_t[:] right_values, +def asof_join_nearest(numeric_t[:] left_values, + numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_size, right_size, i ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - asof_t bdiff, fdiff + numeric_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 959d83a55d4f3..9d3b80b321537 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -2,17 +2,9 @@ import cython from cython import Py_ssize_t from numpy cimport ( - float32_t, - float64_t, - int8_t, - int16_t, - int32_t, int64_t, ndarray, uint8_t, - uint16_t, - uint32_t, - uint64_t, ) import numpy as np @@ -21,27 +13,15 @@ cimport numpy as cnp cnp.import_array() +from pandas._libs.dtypes cimport numeric_object_t from pandas._libs.lib cimport c_is_list_like -ctypedef fused reshape_t: - uint8_t - uint16_t - uint32_t - uint64_t - int8_t - int16_t - int32_t - int64_t - float32_t - float64_t - object - @cython.wraparound(False) @cython.boundscheck(False) -def unstack(reshape_t[:, :] values, const uint8_t[:] mask, +def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, - reshape_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: + numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ Transform long values to wide new_values. @@ -60,7 +40,7 @@ def unstack(reshape_t[:, :] values, const uint8_t[:] mask, cdef: Py_ssize_t i, j, w, nulls, s, offset - if reshape_t is not object: + if numeric_object_t is not object: # evaluated at compile-time with nogil: for i in range(stride): diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index be22fc368c28f..df88c896ac593 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -16,18 +16,3 @@ cdef extern from "src/headers/stdint.h": enum: INT32_MIN enum: INT64_MAX enum: INT64_MIN - - -ctypedef fused numeric: - cnp.int8_t - cnp.int16_t - cnp.int32_t - cnp.int64_t - - cnp.uint8_t - cnp.uint16_t - cnp.uint32_t - cnp.uint64_t - - cnp.float32_t - cnp.float64_t diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ea52bd24a3689..29fe20090875b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -20,15 +20,14 @@ from numpy cimport ( cnp.import_array() -cdef extern from "src/headers/cmath" namespace "std": +cdef extern from "../src/headers/cmath" namespace "std": bint isnan(float64_t) nogil bint notnan(float64_t) nogil int signbit(float64_t) nogil float64_t sqrt(float64_t x) nogil from pandas._libs.algos import is_monotonic - -from pandas._libs.util cimport numeric +from pandas._libs.dtypes cimport numeric_t cdef extern from "../src/skiplist.h": @@ -851,18 +850,18 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, # https://github.com/pydata/bottleneck -cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: +cdef inline numeric_t init_mm(numeric_t ai, Py_ssize_t *nobs, bint is_max) nogil: - if numeric in cython.floating: + if numeric_t in cython.floating: if ai == ai: nobs[0] = nobs[0] + 1 elif is_max: - if numeric == cython.float: + if numeric_t == cython.float: ai = MINfloat32 else: ai = MINfloat64 else: - if numeric == cython.float: + if numeric_t == cython.float: ai = MAXfloat32 else: ai = MAXfloat64 @@ -873,18 +872,18 @@ cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: return ai -cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: +cdef inline void remove_mm(numeric_t aold, Py_ssize_t *nobs) nogil: """ remove a value from the mm calc """ - if numeric in cython.floating and aold == aold: + if numeric_t in cython.floating and aold == aold: nobs[0] = nobs[0] - 1 -cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, - numeric value) nogil: +cdef inline numeric_t calc_mm(int64_t minp, Py_ssize_t nobs, + numeric_t value) nogil: cdef: - numeric result + numeric_t result - if numeric in cython.floating: + if numeric_t in cython.floating: if nobs >= minp: result = value else: @@ -940,13 +939,13 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max(values, start, end, minp, is_max=0) -cdef _roll_min_max(ndarray[numeric] values, +cdef _roll_min_max(ndarray[numeric_t] values, ndarray[int64_t] starti, ndarray[int64_t] endi, int64_t minp, bint is_max): cdef: - numeric ai + numeric_t ai int64_t curr_win_size, start Py_ssize_t i, k, nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front