diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml
index 1d3794576220a..0dfc806f4b631 100644
--- a/ci/deps/actions-37-db-min.yaml
+++ b/ci/deps/actions-37-db-min.yaml
@@ -27,6 +27,7 @@ dependencies:
   - lxml=4.3.0
   - matplotlib
   - nomkl
+  - numba
   - numexpr
   - openpyxl
   - pandas-gbq
diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml
index 5381caaa242cf..75f934ec4690e 100644
--- a/ci/deps/actions-37-db.yaml
+++ b/ci/deps/actions-37-db.yaml
@@ -24,6 +24,7 @@ dependencies:
   - moto>=1.3.14
   - flask
   - nomkl
+  - numba
   - numexpr
   - numpy=1.16.*
   - odfpy
diff --git a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml
index d9ad1f538908e..5433f01b9bc35 100644
--- a/ci/deps/actions-37-locale_slow.yaml
+++ b/ci/deps/actions-37-locale_slow.yaml
@@ -17,6 +17,7 @@ dependencies:
   - bottleneck=1.2.*
   - lxml
   - matplotlib=3.0.0
+  - numba
   - numpy=1.16.*
   - openpyxl=3.0.0
   - python-dateutil
diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml
index 61f431256dd4a..430e6fabbe237 100644
--- a/ci/deps/actions-37.yaml
+++ b/ci/deps/actions-37.yaml
@@ -15,6 +15,7 @@ dependencies:
   # pandas dependencies
   - botocore>=1.11
   - fsspec>=0.7.4
+  - numba
   - numpy
   - python-dateutil
   - nomkl
diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml
index 629804c71e726..1d22ad1c1b18c 100644
--- a/ci/deps/actions-38-locale.yaml
+++ b/ci/deps/actions-38-locale.yaml
@@ -23,6 +23,7 @@ dependencies:
   - matplotlib <3.3.0
   - moto
   - nomkl
+  - numba
   - numexpr
   - numpy<1.20  # GH#39541 compat with pyarrow<3
   - openpyxl
diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml
index e2660d07c3558..2607a943c434b 100644
--- a/ci/deps/actions-38.yaml
+++ b/ci/deps/actions-38.yaml
@@ -13,6 +13,7 @@ dependencies:
   - hypothesis>=3.58.0
 
   # pandas dependencies
+  - numba
   - numpy
   - python-dateutil
   - nomkl
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 36e8bf528fc3e..d6fda472c5a32 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -12,6 +12,7 @@ dependencies:
   - hypothesis>=3.58.0
 
   # pandas dependencies
+  - numba
   - numpy
   - python-dateutil
   - pytz
diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml
index d667adddda859..e0f6088e9f093 100644
--- a/ci/deps/azure-macos-37.yaml
+++ b/ci/deps/azure-macos-37.yaml
@@ -18,6 +18,7 @@ dependencies:
   - lxml
   - matplotlib=2.2.3
   - nomkl
+  - numba
   - numexpr
   - numpy=1.16.5
   - openpyxl
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index e7ac4c783b855..a257fa6d27ae4 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -23,6 +23,7 @@ dependencies:
   - matplotlib=2.2.*
   - moto>=1.3.14
   - flask
+  - numba
   - numexpr
   - numpy=1.16.*
   - openpyxl
diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml
index 8df6104f43a50..3ea1bc89af523 100644
--- a/ci/deps/travis-37-arm64.yaml
+++ b/ci/deps/travis-37-arm64.yaml
@@ -12,6 +12,7 @@ dependencies:
 
   # pandas dependencies
   - botocore>=1.11
+  - numba
   - numpy
   - python-dateutil
   - pytz
diff --git a/pandas/_libs_numba/__init__.py b/pandas/_libs_numba/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py
new file mode 100644
index 0000000000000..b191131c2e914
--- /dev/null
+++ b/pandas/_libs_numba/algos.py
@@ -0,0 +1,1593 @@
+from __future__ import annotations
+
+import numba
+
+# from numba import (
+#     float32,
+#     float64,
+#     int8,
+#     int16,
+#     int32,
+#     int64,
+#     intp,
+#     types,
+#     uint8,
+#     void,
+# )
+import numpy as np
+
+import pandas._libs_numba.util as util
+
+# import cython
+# from cython import Py_ssize_t
+
+# from libc.math cimport fabs, sqrt
+# from libc.stdlib cimport free, malloc
+# from libc.string cimport memmove
+
+# cimport numpy as cnp
+# from numpy cimport (
+#     NPY_FLOAT32,
+#     NPY_FLOAT64,
+#     NPY_INT8,
+#     NPY_INT16,
+#     NPY_INT32,
+#     NPY_INT64,
+#     NPY_OBJECT,
+#     NPY_UINT8,
+#     NPY_UINT16,
+#     NPY_UINT32,
+#     NPY_UINT64,
+#     float32_t,
+#     float64_t,
+#     int8_t,
+#     int16_t,
+#     int32_t,
+#     int64_t,
+#     intp_t,
+#     ndarray,
+#     uint8_t,
+#     uint16_t,
+#     uint32_t,
+#     uint64_t,
+# )
+
+# cnp.import_array()
+
+
+# from pandas._libs.khash cimport (
+#     kh_destroy_int64,
+#     kh_get_int64,
+#     kh_init_int64,
+#     kh_int64_t,
+#     kh_put_int64,
+#     kh_resize_int64,
+#     khiter_t,
+# )
+# from pandas._libs.util cimport get_nat, numeric
+
+# import pandas._libs.missing as missing
+
+# cdef:
+#     float64_t FP_ERR = 1e-13
+#     float64_t NaN = np.NaN
+#     int64_t NPY_NAT = get_nat()
+
+# tiebreakers = {
+#     "average": TIEBREAK_AVERAGE,
+#     "min": TIEBREAK_MIN,
+#     "max": TIEBREAK_MAX,
+#     "first": TIEBREAK_FIRST,
+#     "dense": TIEBREAK_DENSE,
+# }
+
+
+# cdef inline bint are_diff(object left, object right):
+#     try:
+#         return fabs(left - right) > FP_ERR
+#     except TypeError:
+#         return left != right
+
+
+# class Infinity:
+#     """
+#     Provide a positive Infinity comparison method for ranking.
+#     """
+#     __lt__ = lambda self, other: False
+#     __le__ = lambda self, other: isinstance(other, Infinity)
+#     __eq__ = lambda self, other: isinstance(other, Infinity)
+#     __ne__ = lambda self, other: not isinstance(other, Infinity)
+#     __gt__ = lambda self, other: (not isinstance(other, Infinity) and
+#                                   not missing.checknull(other))
+#     __ge__ = lambda self, other: not missing.checknull(other)
+
+
+# class NegInfinity:
+#     """
+#     Provide a negative Infinity comparison method for ranking.
+#     """
+#     __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
+#                                   not missing.checknull(other))
+#     __le__ = lambda self, other: not missing.checknull(other)
+#     __eq__ = lambda self, other: isinstance(other, NegInfinity)
+#     __ne__ = lambda self, other: not isinstance(other, NegInfinity)
+#     __gt__ = lambda self, other: False
+#     __ge__ = lambda self, other: isinstance(other, NegInfinity)
+
+
+@numba.njit
+def unique_deltas(arr: np.ndarray) -> np.ndarray:
+    """
+    Efficiently find the unique first-differences of the given array.
+
+    Parameters
+    ----------
+    arr : ndarray[int64_t]
+
+    Returns
+    -------
+    ndarray[int64_t, ndim=1]
+        An ordered ndarray[int64_t]
+    """
+    n = len(arr)
+    uniques = []
+    seen = set()
+
+    for i in range(n - 1):
+        val = arr[i + 1] - arr[i]
+        if val not in seen:
+            seen.add(val)
+            uniques.append(val)
+
+    result = np.array(uniques, dtype=np.int64)
+    result.sort()
+    return result
+
+
+def is_lexsorted(list_of_arrays: list[np.ndarray]) -> bool:
+    nlevels = len(list_of_arrays)
+    n = len(list_of_arrays[0])
+    arr = np.concatenate(list_of_arrays)
+    arr = arr.reshape(nlevels, n)
+    return _is_lexsorted(arr)
+
+
+@numba.njit
+def _is_lexsorted(vecs: np.ndarray) -> bool:
+    result = True
+    nlevels, n = vecs.shape
+
+    for i in range(1, n):
+        for k in range(nlevels):
+            cur = vecs[k, i]
+            pre = vecs[k, i - 1]
+            if cur == pre:
+                continue
+            elif cur > pre:
+                break
+            else:
+                result = False
+                break
+
+    return result
+
+
+@numba.njit
+def groupsort_indexer(index: np.ndarray, ngroups: int) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Compute a 1-d indexer.
+
+    The indexer is an ordering of the passed index,
+    ordered by the groups.
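+
+    For example (illustrative), ``groupsort_indexer(np.array([1, 0, 1, 1]), 2)``
+    returns ``result = [1, 0, 2, 3]`` and ``counts = [0, 1, 3]``; slot 0 of
+    ``counts`` is reserved for NA labels (encoded as -1).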
+ + Parameters + ---------- + index: ndarray + Mappings from group -> position. + ngroups: int + Number of groups. + + Returns + ------- + tuple + 1-d indexer ordered by groups, group counts. + + Notes + ----- + This is a reverse of the label factorization process. + """ + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + result = np.zeros(n, dtype=np.int64) + where = np.zeros(ngroups + 1, dtype=np.int64) + + # count group sizes, location 0 for NA + for i in range(n): + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + for i in range(1, ngroups + 1): + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + for i in range(n): + label = index[i] + 1 + result[where[label]] = i + where[label] += 1 + + return result, counts + + +@numba.njit +def kth_smallest(a: np.ndarray, k): + n = a.shape[0] + + l = 0 + m = n - 1 + + while l < m: + x = a[k] + i = l + j = m + + while 1: + while a[i] < x: + i += 1 + while x < a[j]: + j -= 1 + if i <= j: + a[i], a[j] = a[j], a[i] + i += 1 + j -= 1 + + if i > j: + break + + if j < k: + l = i + if k < i: + m = j + return a[k] + + +# # ---------------------------------------------------------------------- +# # Pairwise correlation/covariance + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): +# cdef: +# Py_ssize_t i, j, xi, yi, N, K +# bint minpv +# ndarray[float64_t, ndim=2] result +# ndarray[uint8_t, ndim=2] mask +# int64_t nobs = 0 +# float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx +# float64_t ssqdmy, covxy + +# N, K = (mat).shape + +# if minp is None: +# minpv = 1 +# else: +# minpv = minp + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat).view(np.uint8) + +# with nogil: +# for xi in range(K): +# for yi in range(xi + 1): +# # Welford's method for the variance-calculation +# # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +# nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 +# for i in range(N): +# if mask[i, xi] and mask[i, yi]: +# vx = mat[i, xi] +# vy = mat[i, yi] +# nobs += 1 +# prev_meanx = meanx +# prev_meany = meany +# meanx = meanx + 1 / nobs * (vx - meanx) +# meany = meany + 1 / nobs * (vy - meany) +# ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx) +# ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany) +# covxy = covxy + (vx - meanx) * (vy - prev_meany) + +# if nobs < minpv: +# result[xi, yi] = result[yi, xi] = NaN +# else: +# divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + +# if divisor != 0: +# result[xi, yi] = result[yi, xi] = covxy / divisor +# else: +# result[xi, yi] = result[yi, xi] = NaN + +# return result + +# # ---------------------------------------------------------------------- +# # Pairwise Spearman correlation + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: +# cdef: +# Py_ssize_t i, j, xi, yi, N, K +# ndarray[float64_t, ndim=2] result +# ndarray[float64_t, ndim=2] ranked_mat +# ndarray[float64_t, ndim=1] maskedx +# ndarray[float64_t, ndim=1] maskedy +# ndarray[uint8_t, ndim=2] mask +# int64_t nobs = 0 +# float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor +# const int64_t[:] labels_n, labels_nobs + +# N, K = (mat).shape +# # For compatibility when calling rank_1d +# labels_n = np.zeros(N, dtype=np.int64) + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat).view(np.uint8) + +# 
ranked_mat = np.empty((N, K), dtype=np.float64) + +# for i in range(K): +# ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) + +# for xi in range(K): +# for yi in range(xi + 1): +# nobs = 0 +# # Keep track of whether we need to recompute ranks +# all_ranks = True +# for i in range(N): +# all_ranks &= not (mask[i, xi] ^ mask[i, yi]) +# if mask[i, xi] and mask[i, yi]: +# nobs += 1 + +# if nobs < minp: +# result[xi, yi] = result[yi, xi] = NaN +# else: +# maskedx = np.empty(nobs, dtype=np.float64) +# maskedy = np.empty(nobs, dtype=np.float64) +# j = 0 + +# for i in range(N): +# if mask[i, xi] and mask[i, yi]: +# maskedx[j] = ranked_mat[i, xi] +# maskedy[j] = ranked_mat[i, yi] +# j += 1 + +# if not all_ranks: +# labels_nobs = np.zeros(nobs, dtype=np.int64) +# maskedx = rank_1d(maskedx, labels=labels_nobs) +# maskedy = rank_1d(maskedy, labels=labels_nobs) + +# mean = (nobs + 1) / 2. + +# # now the cov numerator +# sumx = sumxx = sumyy = 0 + +# for i in range(nobs): +# vx = maskedx[i] - mean +# vy = maskedy[i] - mean + +# sumx += vx * vy +# sumxx += vx * vx +# sumyy += vy * vy + +# divisor = sqrt(sumxx * sumyy) + +# if divisor != 0: +# result[xi, yi] = result[yi, xi] = sumx / divisor +# else: +# result[xi, yi] = result[yi, xi] = NaN + +# return result + + +# # ---------------------------------------------------------------------- +# # Kendall correlation +# # Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient # noqa + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: +# """ +# Perform kendall correlation on a 2d array + +# Parameters +# ---------- +# mat : np.ndarray[float64_t, ndim=2] +# Array to compute kendall correlation on +# minp : int, default 1 +# Minimum number of observations required per pair of columns +# to have a valid result. 
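+#         Column pairs with fewer than `minp` shared observations yield
+#         NaN in the result.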
+ +# Returns +# ------- +# numpy.ndarray[float64_t, ndim=2] +# Correlation matrix +# """ +# cdef: +# Py_ssize_t i, j, k, xi, yi, N, K +# ndarray[float64_t, ndim=2] result +# ndarray[float64_t, ndim=2] ranked_mat +# ndarray[uint8_t, ndim=2] mask +# float64_t currj +# ndarray[uint8_t, ndim=1] valid +# ndarray[int64_t] sorted_idxs +# ndarray[float64_t, ndim=1] col +# int64_t n_concordant +# int64_t total_concordant = 0 +# int64_t total_discordant = 0 +# float64_t kendall_tau +# int64_t n_obs +# const int64_t[:] labels_n + +# N, K = (mat).shape + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat) + +# ranked_mat = np.empty((N, K), dtype=np.float64) +# # For compatibility when calling rank_1d +# labels_n = np.zeros(N, dtype=np.int64) + +# for i in range(K): +# ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) + +# for xi in range(K): +# sorted_idxs = ranked_mat[:, xi].argsort() +# ranked_mat = ranked_mat[sorted_idxs] +# mask = mask[sorted_idxs] +# for yi in range(xi + 1, K): +# valid = mask[:, xi] & mask[:, yi] +# if valid.sum() < minp: +# result[xi, yi] = NaN +# result[yi, xi] = NaN +# else: +# # Get columns and order second column using 1st column ranks +# if not valid.all(): +# col = ranked_mat[valid.nonzero()][:, yi] +# else: +# col = ranked_mat[:, yi] +# n_obs = col.shape[0] +# total_concordant = 0 +# total_discordant = 0 +# for j in range(n_obs - 1): +# currj = col[j] +# # Count num concordant and discordant pairs +# n_concordant = 0 +# for k in range(j, n_obs): +# if col[k] > currj: +# n_concordant += 1 +# total_concordant += n_concordant +# total_discordant += (n_obs - 1 - j - n_concordant) +# # Note: we do total_concordant+total_discordant here which is +# # equivalent to the C(n, 2), the total # of pairs, +# # listed on wikipedia +# kendall_tau = (total_concordant - total_discordant) / \ +# (total_concordant + total_discordant) +# result[xi, yi] = kendall_tau +# result[yi, xi] = kendall_tau + +# if mask[:, xi].sum() > minp: +# result[xi, xi] = 1 +# else: +# result[xi, xi] = NaN + +# return result + + +# ---------------------------------------------------------------------- + + +def validate_limit(limit: int | None = None) -> None: + """ + Check that the `limit` argument is a positive integer or None. 
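+
+    For example (illustrative): ``validate_limit(None)`` and
+    ``validate_limit(3)`` return silently, while ``validate_limit(0)`` or
+    ``validate_limit(2.5)`` raise ``ValueError``.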
+ """ + if limit is None: + return + elif not util.is_integer_object(limit): + raise ValueError("Limit must be an integer") + elif limit < 1: + raise ValueError("Limit must be greater than 0") + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +# cdef: +# Py_ssize_t i, j, nleft, nright +# ndarray[int64_t, ndim=1] indexer +# algos_t cur, next_val +# int lim, fill_count = 0 + +# nleft = len(old) +# nright = len(new) +# indexer = np.empty(nright, dtype=np.int64) +# indexer[:] = -1 + +# lim = validate_limit(nright, limit) + +# if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: +# return indexer + +# i = j = 0 + +# cur = old[0] + +# while j <= nright - 1 and new[j] < cur: +# j += 1 + +# while True: +# if j == nright: +# break + +# if i == nleft - 1: +# while j < nright: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] > cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j += 1 +# break + +# next_val = old[i + 1] + +# while j < nright and cur <= new[j] < next_val: +# if new[j] == cur: +# indexer[j] = i +# elif fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j += 1 + +# fill_count = 0 +# i += 1 +# cur = next_val + +# return indexer + + +def pad_inplace(values: np.ndarray, mask: np.ndarray, limit: int | None = None) -> None: + validate_limit(limit) + _pad_inplace(values, mask, limit) + + +@numba.jit +def _pad_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + if values.shape[0]: + if limit is None: + _pad_inplace_no_limit(values, mask) + else: + _pad_inplace_with_limit(values, mask, limit) + + +@numba.jit +def _pad_inplace_no_limit(values: np.ndarray, mask: np.ndarray) -> None: + N = len(values) + val, prev_mask = values[0], mask[0] + for i in range(N): + if mask[i]: + values[i], mask[i] = val, prev_mask + else: + val, prev_mask = values[i], mask[i] + + +@numba.jit +def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> None: + N = len(values) + fill_count = 0 + val, prev_mask = values[0], mask[0] + for i in range(N): + if mask[i]: + if fill_count >= limit: + continue + fill_count += 1 + values[i], mask[i] = val, prev_mask + + else: + fill_count = 0 + val, prev_mask = values[i], mask[i] + + +def pad_2d_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + validate_limit(limit) + _pad_2d_inplace(values, mask, limit) + + +@numba.jit +def _pad_2d_inplace(values, mask, limit=None): + if values.shape[1]: + if limit is None: + _pad_2d_inplace_no_limit(values, mask) + else: + _pad_2d_inplace_with_limit(values, mask, limit) + + +@numba.jit +def _pad_2d_inplace_no_limit(values, mask): + K, N = values.shape + for j in range(K): + val = values[j, 0] + for i in range(N): + if mask[j, i]: + values[j, i] = val + else: + val = values[j, i] + + +@numba.jit +def _pad_2d_inplace_with_limit(values, mask, limit): + K, N = values.shape + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= limit: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +# """ +# Backfilling logic for generating fill vector + +# Diagram of what's going on + +# Old New Fill vector Mask +# . 0 1 +# . 0 1 +# . 0 1 +# A A 0 1 +# . 1 1 +# . 1 1 +# . 1 1 +# . 1 1 +# . 1 1 +# B B 1 1 +# . 2 1 +# . 2 1 +# . 2 1 +# C C 2 1 +# . 0 +# . 
0 +# D +# """ + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +# cdef: +# Py_ssize_t i, j, nleft, nright +# ndarray[int64_t, ndim=1] indexer +# algos_t cur, prev +# int lim, fill_count = 0 + +# nleft = len(old) +# nright = len(new) +# indexer = np.empty(nright, dtype=np.int64) +# indexer[:] = -1 + +# lim = validate_limit(nright, limit) + +# if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: +# return indexer + +# i = nleft - 1 +# j = nright - 1 + +# cur = old[nleft - 1] + +# while j >= 0 and new[j] > cur: +# j -= 1 + +# while True: +# if j < 0: +# break + +# if i == 0: +# while j >= 0: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] < cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j -= 1 +# break + +# prev = old[i - 1] + +# while j >= 0 and prev < new[j] <= cur: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] < cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j -= 1 + +# fill_count = 0 +# i -= 1 +# cur = prev + +# return indexer + + +@numba.njit +def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: + """ + Returns + ------- + tuple + is_monotonic_inc : bool + is_monotonic_dec : bool + is_unique : bool + """ + is_monotonic_inc = True + is_monotonic_dec = True + is_unique = True + is_strict_monotonic = True + + n = len(arr) + + if n == 1: + if arr[0] != arr[0]: + # single value is NaN/NaT + return False, False, True + else: + return True, True, True + elif n < 2: + return True, True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + is_monotonic_inc = False + elif cur > prev: + is_monotonic_dec = False + elif cur == prev: + is_unique = False + else: + # cur or prev is NaN/NaT + is_monotonic_inc = False + is_monotonic_dec = False + break + if not is_monotonic_inc and not is_monotonic_dec: + break + prev = cur + + is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec) + return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic + + +# # ---------------------------------------------------------------------- +# # rank_1d, rank_2d +# # ---------------------------------------------------------------------- + +# ctypedef fused rank_t: +# object +# float64_t +# uint64_t +# int64_t + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def rank_1d( +# ndarray[rank_t, ndim=1] values, +# const int64_t[:] labels, +# ties_method="average", +# bint ascending=True, +# bint pct=False, +# na_option="keep", +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. + +# Parameters +# ---------- +# values : array of rank_t values to be ranked +# labels : array containing unique label for each group, with its ordering +# matching up to the corresponding record in `values`. 
If not called +# from a groupby operation, will be an array of 0's +# ties_method : {'average', 'min', 'max', 'first', 'dense'}, default +# 'average' +# * average: average rank of group +# * min: lowest rank in group +# * max: highest rank in group +# * first: ranks assigned in order they appear in the array +# * dense: like 'min', but rank always increases by 1 between groups +# ascending : boolean, default True +# False for ranks by high (1) to low (N) +# na_option : {'keep', 'top', 'bottom'}, default 'keep' +# pct : boolean, default False +# Compute percentage rank of data within each group +# na_option : {'keep', 'top', 'bottom'}, default 'keep' +# * keep: leave NA values where they are +# * top: smallest rank if ascending +# * bottom: smallest rank if descending +# """ +# cdef: +# TiebreakEnumType tiebreak +# Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 +# Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 +# ndarray[int64_t, ndim=1] lexsort_indexer +# ndarray[float64_t, ndim=1] grp_sizes, out +# ndarray[rank_t, ndim=1] masked_vals +# ndarray[uint8_t, ndim=1] mask +# bint keep_na, at_end, next_val_diff, check_labels +# rank_t nan_fill_val + +# tiebreak = tiebreakers[ties_method] +# keep_na = na_option == 'keep' + +# N = len(values) +# # TODO Cython 3.0: cast won't be necessary (#2992) +# assert len(labels) == N +# out = np.empty(N) +# grp_sizes = np.ones(N) +# # If all 0 labels, can short-circuit later label +# # comparisons +# check_labels = np.any(labels) + +# # Copy values into new array in order to fill missing data +# # with mask, without obfuscating location of missing data +# # in values array +# if rank_t is object and values.dtype != np.object_: +# masked_vals = values.astype('O') +# else: +# masked_vals = values.copy() + +# if rank_t is object: +# mask = missing.isnaobj(masked_vals) +# elif rank_t is int64_t: +# mask = (masked_vals == NPY_NAT).astype(np.uint8) +# elif rank_t is float64_t: +# mask = np.isnan(masked_vals).astype(np.uint8) +# else: +# mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_fill_val = Infinity() +# elif rank_t is int64_t: +# nan_fill_val = np.iinfo(np.int64).max +# elif rank_t is uint64_t: +# nan_fill_val = np.iinfo(np.uint64).max +# else: +# nan_fill_val = np.inf +# order = (masked_vals, mask, labels) +# else: +# if rank_t is object: +# nan_fill_val = NegInfinity() +# elif rank_t is int64_t: +# nan_fill_val = np.iinfo(np.int64).min +# elif rank_t is uint64_t: +# nan_fill_val = 0 +# else: +# nan_fill_val = -np.inf + +# order = (masked_vals, ~mask, labels) + +# np.putmask(masked_vals, mask, nan_fill_val) + +# # lexsort using labels, then mask, then actual values +# # each label corresponds to a different group value, +# # the mask helps you differentiate missing values before +# # performing sort on the actual values +# lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False) + +# if not ascending: +# lexsort_indexer = lexsort_indexer[::-1] + +# # Loop over the length of the value array +# # each incremental i value can be looked up in the lexsort_indexer +# # array that we sorted previously, which gives us the location of +# # that sorted value for retrieval back from the original +# # values / masked_vals arrays +# # TODO: de-duplicate once cython supports conditional nogil +# if rank_t is object: +# for i in range(N): +# at_end = i == N - 1 +# # dups and sum_ranks will be incremented each loop where +# # the value / group remains the same, and 
should be reset +# # when either of those change +# # Used to calculate tiebreakers +# dups += 1 +# sum_ranks += i - grp_start + 1 + +# # Update out only when there is a transition of values or labels. +# # When a new value or group is encountered, go back #dups steps( +# # the number of occurrence of current value) and assign the ranks +# # based on the starting index of the current group (grp_start) +# # and the current index +# if not at_end: +# next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], +# masked_vals[lexsort_indexer[i+1]]) +# else: +# next_val_diff = True + +# if (next_val_diff +# or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) +# or (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])) +# ): +# # if keep_na, check for missing values and assign back +# # to the result where appropriate +# if keep_na and mask[lexsort_indexer[i]]: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = NaN +# grp_na_count = dups +# elif tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start + 1 +# elif tiebreak == TIEBREAK_FIRST: +# for j in range(i - dups + 1, i + 1): +# if ascending: +# out[lexsort_indexer[j]] = j + 1 - grp_start +# else: +# out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start +# elif tiebreak == TIEBREAK_DENSE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = grp_vals_seen + +# # look forward to the next value (using the sorting in _as) +# # if the value does not equal the current value then we need to +# # reset the dups and sum_ranks, knowing that a new value is +# # coming up. the conditional also needs to handle nan equality +# # and the end of iteration +# if next_val_diff or (mask[lexsort_indexer[i]] +# ^ mask[lexsort_indexer[i+1]]): +# dups = sum_ranks = 0 +# grp_vals_seen += 1 +# grp_tie_count += 1 + +# # Similar to the previous conditional, check now if we are +# # moving to a new group. If so, keep track of the index where +# # the new group occurs, so the tiebreaker calculations can +# # decrement that from their position. fill in the size of each +# # group encountered (used by pct calculations later). also be +# # sure to reset any of the items helping to calculate dups +# if (at_end or +# (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]]))): +# if tiebreak != TIEBREAK_DENSE: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (i - grp_start + 1 - grp_na_count) +# else: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (grp_tie_count - (grp_na_count > 0)) +# dups = sum_ranks = 0 +# grp_na_count = 0 +# grp_tie_count = 0 +# grp_start = i + 1 +# grp_vals_seen = 1 +# else: +# with nogil: +# for i in range(N): +# at_end = i == N - 1 +# # dups and sum_ranks will be incremented each loop where +# # the value / group remains the same, and should be reset +# # when either of those change +# # Used to calculate tiebreakers +# dups += 1 +# sum_ranks += i - grp_start + 1 + +# # Update out only when there is a transition of values or labels. 
+# # When a new value or group is encountered, go back #dups steps( +# # the number of occurrence of current value) and assign the ranks +# # based on the starting index of the current group (grp_start) +# # and the current index +# if not at_end: +# next_val_diff = (masked_vals[lexsort_indexer[i]] +# != masked_vals[lexsort_indexer[i+1]]) +# else: +# next_val_diff = True + +# if (next_val_diff +# or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) +# or (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])) +# ): +# # if keep_na, check for missing values and assign back +# # to the result where appropriate +# if keep_na and mask[lexsort_indexer[i]]: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = NaN +# grp_na_count = dups +# elif tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start + 1 +# elif tiebreak == TIEBREAK_FIRST: +# for j in range(i - dups + 1, i + 1): +# if ascending: +# out[lexsort_indexer[j]] = j + 1 - grp_start +# else: +# out[lexsort_indexer[j]] = \ +# (2 * i - j - dups + 2 - grp_start) +# elif tiebreak == TIEBREAK_DENSE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = grp_vals_seen + +# # look forward to the next value (using the sorting in +# # lexsort_indexer) if the value does not equal the current +# # value then we need to reset the dups and sum_ranks, +# # knowing that a new value is coming up. the conditional +# # also needs to handle nan equality and the end of iteration +# if next_val_diff or (mask[lexsort_indexer[i]] +# ^ mask[lexsort_indexer[i+1]]): +# dups = sum_ranks = 0 +# grp_vals_seen += 1 +# grp_tie_count += 1 + +# # Similar to the previous conditional, check now if we are +# # moving to a new group. If so, keep track of the index where +# # the new group occurs, so the tiebreaker calculations can +# # decrement that from their position. fill in the size of each +# # group encountered (used by pct calculations later). also be +# # sure to reset any of the items helping to calculate dups +# if at_end or (check_labels and +# (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])): +# if tiebreak != TIEBREAK_DENSE: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (i - grp_start + 1 - grp_na_count) +# else: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (grp_tie_count - (grp_na_count > 0)) +# dups = sum_ranks = 0 +# grp_na_count = 0 +# grp_tie_count = 0 +# grp_start = i + 1 +# grp_vals_seen = 1 + +# if pct: +# for i in range(N): +# if grp_sizes[i] != 0: +# out[i] = out[i] / grp_sizes[i] + +# return out + + +# def rank_2d( +# ndarray[rank_t, ndim=2] in_arr, +# int axis=0, +# ties_method="average", +# bint ascending=True, +# na_option="keep", +# bint pct=False, +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. 
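+
+#     Ranks are computed along `axis` (with ``axis=0`` the input is
+#     transposed first, so the ranking runs within each column of the
+#     original array), using the same tie-breaking options as `rank_1d`.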
+# """ +# cdef: +# Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 +# Py_ssize_t infs +# ndarray[float64_t, ndim=2] ranks +# ndarray[rank_t, ndim=2] values +# ndarray[intp_t, ndim=2] argsort_indexer +# ndarray[uint8_t, ndim=2] mask +# rank_t val, nan_value +# float64_t count, sum_ranks = 0.0 +# int tiebreak = 0 +# int64_t idx +# bint check_mask, condition, keep_na + +# tiebreak = tiebreakers[ties_method] + +# keep_na = na_option == 'keep' +# check_mask = rank_t is not uint64_t + +# if axis == 0: +# values = np.asarray(in_arr).T.copy() +# else: +# values = np.asarray(in_arr).copy() + +# if rank_t is object: +# if values.dtype != np.object_: +# values = values.astype('O') + +# if rank_t is not uint64_t: +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_value = Infinity() +# elif rank_t is float64_t: +# nan_value = np.inf +# elif rank_t is int64_t: +# nan_value = np.iinfo(np.int64).max + +# else: +# if rank_t is object: +# nan_value = NegInfinity() +# elif rank_t is float64_t: +# nan_value = -np.inf +# elif rank_t is int64_t: +# nan_value = NPY_NAT + +# if rank_t is object: +# mask = missing.isnaobj2d(values) +# elif rank_t is float64_t: +# mask = np.isnan(values) +# elif rank_t is int64_t: +# mask = values == NPY_NAT + +# np.putmask(values, mask, nan_value) +# else: +# mask = np.zeros_like(values, dtype=bool) + +# n, k = (values).shape +# ranks = np.empty((n, k), dtype='f8') + +# if tiebreak == TIEBREAK_FIRST: +# # need to use a stable sort here +# argsort_indexer = values.argsort(axis=1, kind='mergesort') +# if not ascending: +# tiebreak = TIEBREAK_FIRST_DESCENDING +# else: +# argsort_indexer = values.argsort(1) + +# if not ascending: +# argsort_indexer = argsort_indexer[:, ::-1] + +# values = _take_2d(values, argsort_indexer) + +# for i in range(n): +# dups = sum_ranks = infs = 0 + +# total_tie_count = 0 +# count = 0.0 +# for j in range(k): +# val = values[i, j] +# idx = argsort_indexer[i, j] +# if keep_na and check_mask and mask[i, idx]: +# ranks[i, idx] = NaN +# infs += 1 +# continue + +# count += 1.0 + +# sum_ranks += (j - infs) + 1 +# dups += 1 + +# if rank_t is object: +# condition = ( +# j == k - 1 or +# are_diff(values[i, j + 1], val) or +# (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) +# ) +# else: +# condition = ( +# j == k - 1 or +# values[i, j + 1] != val or +# (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) +# ) + +# if condition: +# if tiebreak == TIEBREAK_AVERAGE: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = j - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = j + 1 +# elif tiebreak == TIEBREAK_FIRST: +# if rank_t is object: +# raise ValueError('first not supported for non-numeric data') +# else: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = z + 1 +# elif tiebreak == TIEBREAK_FIRST_DESCENDING: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 +# elif tiebreak == TIEBREAK_DENSE: +# total_tie_count += 1 +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = total_tie_count +# sum_ranks = dups = 0 +# if pct: +# if tiebreak == TIEBREAK_DENSE: +# ranks[i, :] /= total_tie_count +# else: +# ranks[i, :] /= count +# if axis == 0: +# return ranks.T +# else: +# return ranks + + +@numba.njit +def 
diff_2d( + arr: np.ndarray, + out: np.ndarray, + periods: int, + axis: int, +): + f_contig = arr.flags.f_contiguous + + sx, sy = arr.shape + if f_contig: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + left = arr[i, j] + right = arr[i - periods, j] + out[i, j] = left - right + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + left = arr[i, j] + right = arr[i, j - periods] + out[i, j] = left - right + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + left = arr[i, j] + right = arr[i - periods, j] + out[i, j] = left - right + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + left = arr[i, j] + right = arr[i, j - periods] + out[i, j] = left - right + + +# ---------------------------------------------------------------------- +# ensure_dtype +# ---------------------------------------------------------------------- + + +def ensure_platform_int(arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. np.intp + if isinstance(arr, np.ndarray): + return arr.astype(np.intp, copy=False) + else: + return np.array(arr, dtype=np.intp) + + +def ensure_object(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.object_, copy=False) + else: + return np.array(arr, dtype=np.object_) + + +def ensure_float64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.float64, copy=False) + else: + return np.array(arr, dtype=np.float64) + + +def ensure_float32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.float32, copy=False) + else: + return np.array(arr, dtype=np.float32) + + +def ensure_int8(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int8, copy=False) + else: + return np.array(arr, dtype=np.int8) + + +def ensure_int16(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int16, copy=False) + else: + return np.array(arr, dtype=np.int16) + + +def ensure_int32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int32, copy=False) + else: + return np.array(arr, dtype=np.int32) + + +def ensure_int64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int64, copy=False) + else: + return np.array(arr, dtype=np.int64) + + +def ensure_uint8(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint8, copy=False) + else: + return np.array(arr, dtype=np.uint8) + + +def ensure_uint16(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint16, copy=False) + else: + return np.array(arr, dtype=np.uint16) + + +def ensure_uint32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint32, copy=False) + else: + return np.array(arr, dtype=np.uint32) + + +def ensure_uint64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint64, copy=False) + else: + return np.array(arr, dtype=np.uint64) + + +# ---------------------------------------------------------------------- +# take_1d, take_2d +# ---------------------------------------------------------------------- + + +def _take_1d_no_python( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + func = _take_1d_parallel if n > 10_000 else _take_1d_serial + + 
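+    # Dispatch note: the 10_000-element cutoff between the serial and the
+    # parallel kernel is a heuristic threshold; both callables are compiled
+    # from the same _take_1d body defined below.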
func(values, indexer, out, fill_value, n) + + +def _take_1d_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + _take_1d_serial_object(values, indexer, out, fill_value, n) + + +def _take_1d( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, n: int +) -> None: + for i in numba.prange(n): + idx = indexer[i] + if idx == -1: + out[i] = fill_value + else: + out[i] = values[idx] + + +_take_1d_parallel = numba.njit(parallel=True)(_take_1d) +_take_1d_serial = numba.njit(_take_1d) +_take_1d_serial_object = numba.jit(forceobj=True)(_take_1d) + + +# @numba.njit(void(int8[:], intp[:], int8[:], int8)) +# @numba.njit +def take_1d_int8_int8( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], int32[:], int32)) +# @numba.njit +def take_1d_int8_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int8_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int8_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit( +# void(types.Array(types.int16, 1, "C", readonly=True), intp[:], int16[:], int16) +# ) +# @numba.njit +def take_1d_int16_int16( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], int32[:], int32)) +# @numba.njit +def take_1d_int16_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int16_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int16_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], int32[:], int32)) +# @numba.njit +def take_1d_int32_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int32_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int64[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int64_int64( + values: np.ndarray, indexer: np.ndarray, out: 
np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int64[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(float32[:], intp[:], float32[:], float32)) +# @numba.njit +def take_1d_float32_float32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(float32[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_float32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit( +# [ +# void( +# types.Array(types.int64, 1, "C", readonly=True), +# intp[:], +# float64[:], +# float64, +# ), +# void( +# float64[:], +# intp[:], +# float64[:], +# float64, +# ), +# ] +# ) +# @numba.njit +def take_1d_float64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.jit(forceobj=True) +def take_1d_object_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_object(values, indexer, out, fill_value) + + +# @numba.njit(void(uint8[:], intp[:], uint8[:], uint8)) +# @numba.njit +def take_1d_bool_bool( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +@numba.jit(forceobj=True) +def take_1d_bool_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fill_value + else: + out[i] = True if values[idx] > 0 else False + + +# # generated from template +# include "algos_take_helper.pxi" diff --git a/pandas/_libs_numba/missing.py b/pandas/_libs_numba/missing.py new file mode 100644 index 0000000000000..7cc3273222a3e --- /dev/null +++ b/pandas/_libs_numba/missing.py @@ -0,0 +1,553 @@ +# import numbers + +# import cython +# from cython import Py_ssize_t +# import numpy as np + +# cimport numpy as cnp +# from numpy cimport float64_t, int64_t, ndarray, uint8_t + +# cnp.import_array() + +# from pandas._libs cimport util +# from pandas._libs.tslibs.nattype cimport ( +# c_NaT as NaT, +# checknull_with_nat, +# is_null_datetimelike, +# ) +# from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # noqa + +from decimal import Decimal + +# from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +# from pandas.compat import IS64 +import numba + +from pandas._libs.missing import NA +from pandas._libs.tslibs import is_null_datetimelike + +# cdef: +# float64_t INF = np.inf +# float64_t NEGINF = -INF + +# int64_t NPY_NAT = util.get_nat() + +# bint is_32bit = not IS64 + + +# cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False): +# """ +# Check if two scalars are both NA of matching types. + +# Parameters +# ---------- +# left : Any +# right : Any +# nan_matches_none : bool, default False +# For backwards compatibility, consider NaN as matching None. 
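+
+#     For example (illustrative): ``is_matching_na(np.nan, np.nan)`` is True,
+#     while ``is_matching_na(np.nan, None)`` is False unless
+#     ``nan_matches_none=True``.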
+
+#     Returns
+#     -------
+#     bool
+#     """
+#     if left is None:
+#         if nan_matches_none and util.is_nan(right):
+#             return True
+#         return right is None
+#     elif left is C_NA:
+#         return right is C_NA
+#     elif left is NaT:
+#         return right is NaT
+#     elif util.is_float_object(left):
+#         if nan_matches_none and right is None:
+#             return True
+#         return (
+#             util.is_nan(left)
+#             and util.is_float_object(right)
+#             and util.is_nan(right)
+#         )
+#     elif util.is_complex_object(left):
+#         return (
+#             util.is_nan(left)
+#             and util.is_complex_object(right)
+#             and util.is_nan(right)
+#         )
+#     elif util.is_datetime64_object(left):
+#         return (
+#             get_datetime64_value(left) == NPY_NAT
+#             and util.is_datetime64_object(right)
+#             and get_datetime64_value(right) == NPY_NAT
+#         )
+#     elif util.is_timedelta64_object(left):
+#         return (
+#             get_timedelta64_value(left) == NPY_NAT
+#             and util.is_timedelta64_object(right)
+#             and get_timedelta64_value(right) == NPY_NAT
+#         )
+#     return False
+
+
+@numba.jit(forceobj=True)
+def checknull(val: object) -> bool:
+    """
+    Return a boolean describing whether the input is NA-like, defined here
+    as any of:
+     - None
+     - nan
+     - NaT
+     - np.datetime64 representation of NaT
+     - np.timedelta64 representation of NaT
+     - NA
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    bool
+
+    Notes
+    -----
+    The difference between `checknull` and `checknull_old` is that `checknull`
+    does *not* consider INF or NEGINF to be NA.
+    """
+    return (
+        val is NA or is_null_datetimelike(val, inat_is_null=False) or is_decimal_na(val)
+    )
+
+
+def is_decimal_na(val: object) -> bool:
+    """
+    Check whether `val` is a decimal.Decimal NaN, e.g. Decimal("NaN").
+    """
+    return isinstance(val, Decimal) and val != val
+
+
+# cpdef bint checknull_old(object val):
+#     """
+#     Return a boolean describing whether the input is NA-like, defined
+#     here as any of:
+#      - None
+#      - nan
+#      - INF
+#      - NEGINF
+#      - NaT
+#      - np.datetime64 representation of NaT
+#      - np.timedelta64 representation of NaT
+
+#     Parameters
+#     ----------
+#     val : object
+
+#     Returns
+#     -------
+#     result : bool
+
+#     Notes
+#     -----
+#     The difference between `checknull` and `checknull_old` is that `checknull`
+#     does *not* consider INF or NEGINF to be NA.
+#     """
+#     if checknull(val):
+#         return True
+#     elif util.is_float_object(val) or util.is_complex_object(val):
+#         return val == INF or val == NEGINF
+#     return False
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+# cpdef ndarray[uint8_t] isnaobj(ndarray arr):
+#     """
+#     Return boolean mask denoting which elements of a 1-D array are na-like,
+#     according to the criteria defined in `checknull`:
+#      - None
+#      - nan
+#      - NaT
+#      - np.datetime64 representation of NaT
+#      - np.timedelta64 representation of NaT
+
+#     Parameters
+#     ----------
+#     arr : ndarray
+
+#     Returns
+#     -------
+#     result : ndarray (dtype=np.bool_)
+#     """
+#     cdef:
+#         Py_ssize_t i, n
+#         object val
+#         ndarray[uint8_t] result
+
+#     assert arr.ndim == 1, "'arr' must be 1-D."
+ +# n = len(arr) +# result = np.empty(n, dtype=np.uint8) +# for i in range(n): +# val = arr[i] +# result[i] = checknull(val) +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj_old(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 1-D array are na-like, +# defined as being any of: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - NA + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) +# """ +# cdef: +# Py_ssize_t i, n +# object val +# ndarray[uint8_t] result + +# assert arr.ndim == 1, "'arr' must be 1-D." + +# n = len(arr) +# result = np.zeros(n, dtype=np.uint8) +# for i in range(n): +# val = arr[i] +# result[i] = ( +# checknull(val) +# or util.is_float_object(val) and (val == INF or val == NEGINF) +# ) +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj2d(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 2-D array are na-like, +# according to the criteria defined in `checknull`: +# - None +# - nan +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) + +# Notes +# ----- +# The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d` +# does *not* consider INF or NEGINF to be NA. +# """ +# cdef: +# Py_ssize_t i, j, n, m +# object val +# ndarray[uint8_t, ndim=2] result + +# assert arr.ndim == 2, "'arr' must be 2-D." + +# n, m = (arr).shape +# result = np.zeros((n, m), dtype=np.uint8) +# for i in range(n): +# for j in range(m): +# val = arr[i, j] +# if checknull(val): +# result[i, j] = 1 +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj2d_old(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 2-D array are na-like, +# according to the criteria defined in `checknull_old`: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# ndarray (dtype=np.bool_) + +# Notes +# ----- +# The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d` +# does *not* consider INF or NEGINF to be NA. +# """ +# cdef: +# Py_ssize_t i, j, n, m +# object val +# ndarray[uint8_t, ndim=2] result + +# assert arr.ndim == 2, "'arr' must be 2-D." 
+ +# n, m = (arr).shape +# result = np.zeros((n, m), dtype=np.uint8) +# for i in range(n): +# for j in range(m): +# val = arr[i, j] +# if checknull_old(val): +# result[i, j] = 1 +# return result.view(np.bool_) + + +# def isposinf_scalar(val: object) -> bool: +# return util.is_float_object(val) and val == INF + + +# def isneginf_scalar(val: object) -> bool: +# return util.is_float_object(val) and val == NEGINF + + +# cdef inline bint is_null_datetime64(v): +# # determine if we have a null for a datetime (or integer versions), +# # excluding np.timedelta64('nat') +# if checknull_with_nat(v): +# return True +# elif util.is_datetime64_object(v): +# return get_datetime64_value(v) == NPY_NAT +# return False + + +# cdef inline bint is_null_timedelta64(v): +# # determine if we have a null for a timedelta (or integer versions), +# # excluding np.datetime64('nat') +# if checknull_with_nat(v): +# return True +# elif util.is_timedelta64_object(v): +# return get_timedelta64_value(v) == NPY_NAT +# return False + + +# cdef bint checknull_with_nat_and_na(object obj): +# # See GH#32214 +# return checknull_with_nat(obj) or obj is C_NA + + +# # ----------------------------------------------------------------------------- +# # Implementation of NA singleton + + +# def _create_binary_propagating_op(name, is_divmod=False): + +# def method(self, other): +# if (other is C_NA or isinstance(other, str) +# or isinstance(other, (numbers.Number, np.bool_)) +# or isinstance(other, np.ndarray) and not other.shape): +# # Need the other.shape clause to handle NumPy scalars, +# # since we do a setitem on `out` below, which +# # won't work for NumPy scalars. +# if is_divmod: +# return NA, NA +# else: +# return NA + +# elif isinstance(other, np.ndarray): +# out = np.empty(other.shape, dtype=object) +# out[:] = NA + +# if is_divmod: +# return out, out.copy() +# else: +# return out + +# return NotImplemented + +# method.__name__ = name +# return method + + +# def _create_unary_propagating_op(name): +# def method(self): +# return NA + +# method.__name__ = name +# return method + + +# cdef class C_NAType: +# pass + + +# class NAType(C_NAType): +# """ +# NA ("not available") missing value indicator. + +# .. warning:: + +# Experimental: the behaviour of NA can still change without warning. + +# .. versionadded:: 1.0.0 + +# The NA singleton is a missing value indicator defined by pandas. It is +# used in certain new extension dtypes (currently the "string" dtype). 
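+
+#     A few illustrative interactions: the singleton propagates through
+#     arithmetic and follows Kleene logic for boolean operations.
+
+#     >>> pd.NA + 1
+#     <NA>
+#     >>> True | pd.NA
+#     True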
+# """ + +# _instance = None + +# def __new__(cls, *args, **kwargs): +# if NAType._instance is None: +# NAType._instance = C_NAType.__new__(cls, *args, **kwargs) +# return NAType._instance + +# def __repr__(self) -> str: +# return "" + +# def __format__(self, format_spec) -> str: +# try: +# return self.__repr__().__format__(format_spec) +# except ValueError: +# return self.__repr__() + +# def __bool__(self): +# raise TypeError("boolean value of NA is ambiguous") + +# def __hash__(self): +# # GH 30013: Ensure hash is large enough to avoid hash collisions with integers +# exponent = 31 if is_32bit else 61 +# return 2 ** exponent - 1 + +# def __reduce__(self): +# return "NA" + +# # Binary arithmetic and comparison ops -> propagate + +# __add__ = _create_binary_propagating_op("__add__") +# __radd__ = _create_binary_propagating_op("__radd__") +# __sub__ = _create_binary_propagating_op("__sub__") +# __rsub__ = _create_binary_propagating_op("__rsub__") +# __mul__ = _create_binary_propagating_op("__mul__") +# __rmul__ = _create_binary_propagating_op("__rmul__") +# __matmul__ = _create_binary_propagating_op("__matmul__") +# __rmatmul__ = _create_binary_propagating_op("__rmatmul__") +# __truediv__ = _create_binary_propagating_op("__truediv__") +# __rtruediv__ = _create_binary_propagating_op("__rtruediv__") +# __floordiv__ = _create_binary_propagating_op("__floordiv__") +# __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") +# __mod__ = _create_binary_propagating_op("__mod__") +# __rmod__ = _create_binary_propagating_op("__rmod__") +# __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) +# __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) +# # __lshift__ and __rshift__ are not implemented + +# __eq__ = _create_binary_propagating_op("__eq__") +# __ne__ = _create_binary_propagating_op("__ne__") +# __le__ = _create_binary_propagating_op("__le__") +# __lt__ = _create_binary_propagating_op("__lt__") +# __gt__ = _create_binary_propagating_op("__gt__") +# __ge__ = _create_binary_propagating_op("__ge__") + +# # Unary ops + +# __neg__ = _create_unary_propagating_op("__neg__") +# __pos__ = _create_unary_propagating_op("__pos__") +# __abs__ = _create_unary_propagating_op("__abs__") +# __invert__ = _create_unary_propagating_op("__invert__") + +# # pow has special +# def __pow__(self, other): +# if other is C_NA: +# return NA +# elif isinstance(other, (numbers.Number, np.bool_)): +# if other == 0: +# # returning positive is correct for +/- 0. 
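+#                 # (Anything raised to the zeroth power is 1, so NA ** 0
+#                 # need not propagate.)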
+# return type(other)(1) +# else: +# return NA +# elif isinstance(other, np.ndarray): +# return np.where(other == 0, other.dtype.type(1), NA) + +# return NotImplemented + +# def __rpow__(self, other): +# if other is C_NA: +# return NA +# elif isinstance(other, (numbers.Number, np.bool_)): +# if other == 1: +# return other +# else: +# return NA +# elif isinstance(other, np.ndarray): +# return np.where(other == 1, other, NA) +# return NotImplemented + +# # Logical ops using Kleene logic + +# def __and__(self, other): +# if other is False: +# return False +# elif other is True or other is C_NA: +# return NA +# return NotImplemented + +# __rand__ = __and__ + +# def __or__(self, other): +# if other is True: +# return True +# elif other is False or other is C_NA: +# return NA +# return NotImplemented + +# __ror__ = __or__ + +# def __xor__(self, other): +# if other is False or other is True or other is C_NA: +# return NA +# return NotImplemented + +# __rxor__ = __xor__ + +# __array_priority__ = 1000 +# _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + +# def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): +# types = self._HANDLED_TYPES + (NAType,) +# for x in inputs: +# if not isinstance(x, types): +# return NotImplemented + +# if method != "__call__": +# raise ValueError(f"ufunc method '{method}' not supported for NA") +# result = maybe_dispatch_ufunc_to_dunder_op( +# self, ufunc, method, *inputs, **kwargs +# ) +# if result is NotImplemented: +# # For a NumPy ufunc that's not a binop, like np.logaddexp +# index = [i for i, x in enumerate(inputs) if x is NA][0] +# result = np.broadcast_arrays(*inputs)[index] +# if result.ndim == 0: +# result = result.item() +# if ufunc.nout > 1: +# result = (NA,) * ufunc.nout + +# return result + + +# C_NA = NAType() # C-visible +# NA = C_NA # Python-visible diff --git a/pandas/_libs_numba/tslibs/__init__.py b/pandas/_libs_numba/tslibs/__init__.py new file mode 100644 index 0000000000000..6b9f2d32acd5c --- /dev/null +++ b/pandas/_libs_numba/tslibs/__init__.py @@ -0,0 +1,67 @@ +# __all__ = [ +# "dtypes", +# "localize_pydatetime", +# "NaT", +# "NaTType", +# "iNaT", +# "nat_strings", +# "is_null_datetimelike", +# "OutOfBoundsDatetime", +# "OutOfBoundsTimedelta", +# "IncompatibleFrequency", +# "Period", +# "Resolution", +# "Timedelta", +# "normalize_i8_timestamps", +# "is_date_array_normalized", +# "dt64arr_to_periodarr", +# "delta_to_nanoseconds", +# "ints_to_pydatetime", +# "ints_to_pytimedelta", +# "get_resolution", +# "Timestamp", +# "tz_convert_from_utc_single", +# "to_offset", +# "Tick", +# "BaseOffset", +# "tz_compare", +# ] + +# from pandas._libs.tslibs import dtypes +# from pandas._libs.tslibs.conversion import ( +# OutOfBoundsTimedelta, +# localize_pydatetime, +# ) +# from pandas._libs.tslibs.dtypes import Resolution +# from pandas._libs.tslibs.nattype import ( +# NaT, +# NaTType, +# iNaT, +# is_null_datetimelike, +# nat_strings, +# ) +# from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +# from pandas._libs.tslibs.offsets import ( +# BaseOffset, +# Tick, +# to_offset, +# ) +# from pandas._libs.tslibs.period import ( +# IncompatibleFrequency, +# Period, +# ) +# from pandas._libs.tslibs.timedeltas import ( +# Timedelta, +# delta_to_nanoseconds, +# ints_to_pytimedelta, +# ) +# from pandas._libs.tslibs.timestamps import Timestamp +# from pandas._libs.tslibs.timezones import tz_compare +# from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single +# from pandas._libs.tslibs.vectorized import ( +# 
dt64arr_to_periodarr,
+# get_resolution,
+# ints_to_pydatetime,
+# is_date_array_normalized,
+# normalize_i8_timestamps,
+# )
diff --git a/pandas/_libs_numba/tslibs/util.py b/pandas/_libs_numba/tslibs/util.py
new file mode 100644
index 0000000000000..ecad25c3e0d81
--- /dev/null
+++ b/pandas/_libs_numba/tslibs/util.py
@@ -0,0 +1,223 @@
+import numpy as np
+
+# from cpython.object cimport PyTypeObject
+
+
+# cdef extern from *:
+# """
+# PyObject* char_to_string(const char* data) {
+# return PyUnicode_FromString(data);
+# }
+# """
+# object char_to_string(const char* data)
+
+
+# cdef extern from "Python.h":
+# # Note: importing extern-style allows us to declare these as nogil
+# # functions, whereas `from cpython cimport` does not.
+# bint PyUnicode_Check(object obj) nogil
+# bint PyBool_Check(object obj) nogil
+# bint PyFloat_Check(object obj) nogil
+# bint PyComplex_Check(object obj) nogil
+# bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
+
+# # Note that the following functions can potentially raise an exception; thus
+# # they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
+# # potentially allocate memory in the unlikely case that the underlying
+# # unicode object was stored as non-utf8 and utf8 wasn't requested before.
+# const char* PyUnicode_AsUTF8AndSize(object obj,
+# Py_ssize_t* length) except NULL
+
+# from numpy cimport (
+# float64_t,
+# int64_t,
+# )
+
+
+# cdef extern from "numpy/arrayobject.h":
+# PyTypeObject PyFloatingArrType_Type
+
+# cdef extern from "numpy/ndarrayobject.h":
+# PyTypeObject PyTimedeltaArrType_Type
+# PyTypeObject PyDatetimeArrType_Type
+# PyTypeObject PyComplexFloatingArrType_Type
+# PyTypeObject PyBoolArrType_Type
+
+# bint PyArray_IsIntegerScalar(obj) nogil
+# bint PyArray_Check(obj) nogil
+
+# cdef extern from "numpy/npy_common.h":
+# int64_t NPY_MIN_INT64
+
+
+# cdef inline int64_t get_nat():
+# return NPY_MIN_INT64
+
+
+# --------------------------------------------------------------------
+# Type Checking
+
+
+def is_integer_object(val: object) -> bool:
+    """
+    Cython equivalent of
+
+    `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)`
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    bool
+    """
+    return (
+        not isinstance(val, bool)
+        and isinstance(val, (int, np.integer))
+        and not is_timedelta64_object(val)
+    )
+
+
+# cdef inline bint is_float_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, (float, np.float_))`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_float : bool
+# """
+# return (PyFloat_Check(obj) or
+# (PyObject_TypeCheck(obj, &PyFloatingArrType_Type)))
+
+
+# cdef inline bint is_complex_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, (complex, np.complex_))`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_complex : bool
+# """
+# return (PyComplex_Check(obj) or
+# PyObject_TypeCheck(obj, &PyComplexFloatingArrType_Type))
+
+
+# cdef inline bint is_bool_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, (bool, np.bool_))`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_bool : bool
+# """
+# return (PyBool_Check(obj) or
+# PyObject_TypeCheck(obj, &PyBoolArrType_Type))
+
+
+# cdef inline bint is_real_number_object(object obj) nogil:
+# return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj)
+
+
+def is_timedelta64_object(val: object) -> bool:
+    """
+    Cython equivalent of `isinstance(val, np.timedelta64)`
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    bool
+    """
+    return isinstance(val, np.timedelta64)
+
+
+# cdef inline bint is_datetime64_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, np.datetime64)`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_datetime64 : bool
+# """
+# return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type)
+
+
+# cdef inline bint is_array(object val):
+# """
+# Cython equivalent of `isinstance(val, np.ndarray)`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_ndarray : bool
+# """
+# return PyArray_Check(val)
+
+
+# cdef inline bint is_nan(object val):
+# """
+# Check if val is a Not-A-Number float or complex, including
+# float('NaN') and np.nan.
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_nan : bool
+# """
+# cdef float64_t fval
+# if is_float_object(val):
+# fval = val
+# return fval != fval
+# return is_complex_object(val) and val != val
+
+
+# cdef inline const char* get_c_string_buf_and_size(str py_string,
+# Py_ssize_t *length) except NULL:
+# """
+# Extract the internal char* buffer of unicode or bytes object `py_string`,
+# saving the length of this internal buffer in `length`.
+
+# Notes
+# -----
+# The Python object owns the memory; the returned char* must not be freed.
+# `length` can be NULL if the buffer length is not needed.
+
+# Parameters
+# ----------
+# py_string : str
+# length : Py_ssize_t*
+
+# Returns
+# -------
+# buf : const char*
+# """
+# return PyUnicode_AsUTF8AndSize(py_string, length)
+
+
+# cdef inline const char* get_c_string(str py_string) except NULL:
+# return get_c_string_buf_and_size(py_string, NULL)
diff --git a/pandas/_libs_numba/util.py b/pandas/_libs_numba/util.py
new file mode 100644
index 0000000000000..56239126279ff
--- /dev/null
+++ b/pandas/_libs_numba/util.py
@@ -0,0 +1,50 @@
+# cimport numpy as cnp
+# from numpy cimport ndarray
+
+from pandas._libs_numba.tslibs.util import * # noqa
+
+# cdef extern from "numpy/ndarraytypes.h":
+# void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil
+
+
+# cdef extern from "numpy/arrayobject.h":
+# enum:
+# NPY_ARRAY_C_CONTIGUOUS
+# NPY_ARRAY_F_CONTIGUOUS
+
+
+# cdef extern from "src/headers/stdint.h":
+# enum: UINT8_MAX
+# enum: UINT16_MAX
+# enum: UINT32_MAX
+# enum: UINT64_MAX
+# enum: INT8_MIN
+# enum: INT8_MAX
+# enum: INT16_MIN
+# enum: INT16_MAX
+# enum: INT32_MAX
+# enum: INT32_MIN
+# enum: INT64_MAX
+# enum: INT64_MIN
+
+
+# ctypedef fused numeric:
+# cnp.int8_t
+# cnp.int16_t
+# cnp.int32_t
+# cnp.int64_t
+
+# cnp.uint8_t
+# cnp.uint16_t
+# cnp.uint32_t
+# cnp.uint64_t
+
+# cnp.float32_t
+# cnp.float64_t
+
+
+# cdef inline void set_array_not_contiguous(ndarray ao) nogil:
+# # Numpy>=1.8-compliant equivalent to:
+# # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
+# PyArray_CLEARFLAGS(ao,
+# (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS))
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 15f54c11be0a0..4d8631283e0ab 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -8,6 +8,7 @@
 from textwrap import dedent
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     Optional,
     Tuple,
@@ -28,6 +29,7 @@
     iNaT,
     lib,
 )
+from pandas._libs_numba import algos as algos_numba
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
@@ -1305,7 +1307,7 @@ def compute(self, method: str) -> Series:
         narr = len(arr)
         n = min(n, narr)

-        kth_val = algos.kth_smallest(arr.copy(), n - 1)
+        kth_val =
algos_numba.kth_smallest(arr.copy(), n - 1) (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] @@ -1634,7 +1636,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): """ n = int(n) - na = np.nan + na: Any = np.nan dtype = arr.dtype is_bool = is_bool_dtype(dtype) @@ -1666,9 +1668,9 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False if needs_i8_conversion(arr.dtype): - dtype = np.int64 - arr = arr.view("i8") - na = iNaT + dtype = np.dtype("timedelta64[ns]") + arr = getattr(arr, "_data", arr) + na = None is_timedelta = True elif is_bool: @@ -1698,10 +1700,11 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) out_arr[tuple(na_indexer)] = na - if arr.dtype.name in _diff_special: + if arr.dtype.name in _diff_special or is_timedelta: + assert isinstance(arr, np.ndarray), type(arr) # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) + algos_numba.diff_2d(arr, out_arr, n, axis) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. @@ -1715,9 +1718,6 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer]) - if is_timedelta: - out_arr = out_arr.view("timedelta64[ns]") - if orig_ndim == 1: out_arr = out_arr[:, 0] return out_arr diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index ba1b2a0f0e76e..efbb628479f83 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -13,6 +13,7 @@ algos as libalgos, lib, ) +from pandas._libs_numba import algos as libalgos_numba from pandas._typing import ArrayLike from pandas.core.dtypes.cast import maybe_promote @@ -354,27 +355,31 @@ def wrapper( _take_1d_dict = { - ("int8", "int8"): libalgos.take_1d_int8_int8, - ("int8", "int32"): libalgos.take_1d_int8_int32, - ("int8", "int64"): libalgos.take_1d_int8_int64, - ("int8", "float64"): libalgos.take_1d_int8_float64, - ("int16", "int16"): libalgos.take_1d_int16_int16, - ("int16", "int32"): libalgos.take_1d_int16_int32, - ("int16", "int64"): libalgos.take_1d_int16_int64, - ("int16", "float64"): libalgos.take_1d_int16_float64, - ("int32", "int32"): libalgos.take_1d_int32_int32, - ("int32", "int64"): libalgos.take_1d_int32_int64, - ("int32", "float64"): libalgos.take_1d_int32_float64, - ("int64", "int64"): libalgos.take_1d_int64_int64, - ("int64", "float64"): libalgos.take_1d_int64_float64, - ("float32", "float32"): libalgos.take_1d_float32_float32, - ("float32", "float64"): libalgos.take_1d_float32_float64, - ("float64", "float64"): libalgos.take_1d_float64_float64, - ("object", "object"): libalgos.take_1d_object_object, - ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None), + ("int8", "int8"): libalgos_numba.take_1d_int8_int8, + ("int8", "int32"): libalgos_numba.take_1d_int8_int32, + ("int8", "int64"): libalgos_numba.take_1d_int8_int64, + ("int8", "float64"): libalgos_numba.take_1d_int8_float64, + ("int16", "int16"): libalgos_numba.take_1d_int16_int16, + ("int16", "int32"): libalgos_numba.take_1d_int16_int32, + ("int16", "int64"): libalgos_numba.take_1d_int16_int64, + ("int16", "float64"): libalgos_numba.take_1d_int16_float64, + ("int32", "int32"): libalgos_numba.take_1d_int32_int32, + ("int32", "int64"): 
libalgos_numba.take_1d_int32_int64, + ("int32", "float64"): libalgos_numba.take_1d_int32_float64, + ("int64", "int64"): libalgos_numba.take_1d_int64_int64, + ("int64", "float64"): libalgos_numba.take_1d_int64_float64, + ("float32", "float32"): libalgos_numba.take_1d_float32_float32, + ("float32", "float64"): libalgos_numba.take_1d_float32_float64, + ("float64", "float64"): libalgos_numba.take_1d_float64_float64, + ("object", "object"): libalgos_numba.take_1d_object_object, + ("bool", "bool"): _view_wrapper( + libalgos_numba.take_1d_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos_numba.take_1d_bool_object, np.uint8, None + ), ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 + libalgos_numba.take_1d_int64_int64, np.int64, np.int64, np.int64 ), } diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 769ae52744c74..5b6b3fb778cfd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -24,10 +24,10 @@ from pandas._libs import ( NaT, - algos as libalgos, hashtable as htable, ) from pandas._libs.lib import no_default +from pandas._libs_numba import algos as libalgos from pandas._typing import ( ArrayLike, Dtype, @@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_object, + ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, is_dict_like, @@ -533,7 +534,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # error: Incompatible types in assignment (expression has type "ndarray", # variable has type "Categorical") result = take_nd( # type: ignore[assignment] - new_cats, libalgos.ensure_platform_int(self._codes) + new_cats, ensure_platform_int(self._codes) ) return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0900688e04374..b79ebfad380d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -22,10 +22,7 @@ import numpy as np -from pandas._libs import ( - algos, - lib, -) +from pandas._libs import lib from pandas._libs.tslibs import ( BaseOffset, IncompatibleFrequency, @@ -44,6 +41,7 @@ round_nsint64, ) from pandas._libs.tslibs.timestamps import integer_op_not_supported +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, DatetimeLikeScalar, @@ -1030,11 +1028,11 @@ def _generate_range( @property def _is_monotonic_increasing(self) -> bool: - return algos.is_monotonic(self.asi8, timelike=True)[0] + return algos.is_monotonic(self._data)[0] @property def _is_monotonic_decreasing(self) -> bool: - return algos.is_monotonic(self.asi8, timelike=True)[1] + return algos.is_monotonic(self._data)[1] @property def _is_unique(self) -> bool: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 32ea82d9c0402..e2739c4487e06 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -14,9 +14,9 @@ from pandas._libs import ( Interval, Period, - algos, ) from pandas._libs.tslibs import conversion +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, DtypeObj, diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8c2cff21c114e..ee4518903ee0d 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -1,6 +1,8 @@ """ missing types & inference """ +from __future__ import annotations + from decimal import Decimal from functools import partial @@ -15,6 +17,7 @@ Period, 
iNaT, ) +import pandas._libs_numba.missing as libmissing_numba from pandas._typing import ( ArrayLike, DtypeObj, @@ -54,7 +57,7 @@ INF_AS_NA = False -def isna(obj): +def isna(obj: object): """ Detect missing values for an array-like object. @@ -137,7 +140,7 @@ def isna(obj): isnull = isna -def _isna(obj, inf_as_na: bool = False): +def _isna(obj: object, inf_as_na: bool = False): """ Detect missing values, treating None, NaN or NA as null. Infinite values will also be treated as null if inf_as_na is True. @@ -157,7 +160,7 @@ def _isna(obj, inf_as_na: bool = False): if inf_as_na: return libmissing.checknull_old(obj) else: - return libmissing.checknull(obj) + return libmissing_numba.checknull(obj) # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e8b83af16254a..9ce6a60b17d6b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,6 @@ import numpy as np from pandas._libs import ( - algos as libalgos, index as libindex, lib, ) @@ -42,6 +41,7 @@ Timestamp, tz_compare, ) +from pandas._libs_numba import algos as libalgos from pandas._typing import ( AnyArrayLike, ArrayLike, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 244fcb9f49ec6..d040a6aff6df2 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -23,11 +23,11 @@ from pandas._config import get_option from pandas._libs import ( - algos as libalgos, index as libindex, lib, ) from pandas._libs.hashtable import duplicated_int64 +from pandas._libs_numba import algos as libalgos from pandas._typing import ( AnyArrayLike, DtypeObj, @@ -1583,9 +1583,7 @@ def is_monotonic_increasing(self) -> bool: if all(level.is_monotonic for level in self.levels): # If each level is sorted, we can operate on the codes directly. GH27495 - return libalgos.is_lexsorted( - [x.astype("int64", copy=False) for x in self.codes] - ) + return libalgos.is_lexsorted(self.codes) # reversed() because lexsort() wants the most significant key last. values = [ @@ -3822,9 +3820,8 @@ def isin(self, values, level=None) -> np.ndarray: def _lexsort_depth(codes: List[np.ndarray], nlevels: int) -> int: """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" - int64_codes = [ensure_int64(level_codes) for level_codes in codes] for k in range(nlevels, 0, -1): - if libalgos.is_lexsorted(int64_codes[:k]): + if libalgos.is_lexsorted(codes[:k]): return k return 0 diff --git a/pandas/core/missing.py b/pandas/core/missing.py index c2193056cc974..c58e746ba0fdf 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -3,10 +3,7 @@ """ from __future__ import annotations -from functools import ( - partial, - wraps, -) +from functools import partial from typing import ( TYPE_CHECKING, Any, @@ -14,19 +11,15 @@ Optional, Set, Union, - cast, ) import numpy as np -from pandas._libs import ( - algos, - lib, -) +from pandas._libs import lib +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, Axis, - F, ) from pandas.compat._optional import import_optional_dependency @@ -252,7 +245,7 @@ def interpolate_1d( ) # default limit is unlimited GH #16282 - limit = algos.validate_limit(nobs=None, limit=limit) + algos.validate_limit(limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... 
all_nans = set(np.flatnonzero(invalid))
@@ -679,61 +672,25 @@ def interpolate_2d(
     return result


-def _fillna_prep(values, mask: Optional[np.ndarray] = None) -> np.ndarray:
-    # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
-
+def _pad_1d(
+    values: np.ndarray,
+    limit: int | None = None,
+    mask: np.ndarray | None = None,
+) -> tuple[np.ndarray, np.ndarray]:
     if mask is None:
         mask = isna(values)
-
-    mask = mask.view(np.uint8)
-    return mask
-
-
-def _datetimelike_compat(func: F) -> F:
-    """
-    Wrapper to handle datetime64 and timedelta64 dtypes.
-    """
-
-    @wraps(func)
-    def new_func(values, limit=None, mask=None):
-        if needs_i8_conversion(values.dtype):
-            if mask is None:
-                # This needs to occur before casting to int64
-                mask = isna(values)
-
-            result, mask = func(values.view("i8"), limit=limit, mask=mask)
-            return result.view(values.dtype), mask
-
-        return func(values, limit=limit, mask=mask)
-
-    return cast(F, new_func)
-
-
-@_datetimelike_compat
-def _pad_1d(
-    values: np.ndarray,
-    limit: int | None = None,
-    mask: np.ndarray | None = None,
-) -> tuple[np.ndarray, np.ndarray]:
-    mask = _fillna_prep(values, mask)
     algos.pad_inplace(values, mask, limit=limit)
     return values, mask


-@_datetimelike_compat
 def _backfill_1d(
     values: np.ndarray,
     limit: int | None = None,
     mask: np.ndarray | None = None,
 ) -> tuple[np.ndarray, np.ndarray]:
-    mask = _fillna_prep(values, mask)
-    algos.backfill_inplace(values, mask, limit=limit)
-    return values, mask
+    # a backfill is a forward fill run on the reversed array
+    _, new_mask = _pad_1d(
+        values[::-1], limit, mask[::-1] if mask is not None else None
+    )
+    # _pad_1d computed its mask on the reversed values, so flip it back
+    return values, (mask if mask is not None else new_mask[::-1])


-@_datetimelike_compat
 def _pad_2d(values, limit=None, mask=None):
-    mask = _fillna_prep(values, mask)
+    if mask is None:
+        mask = isna(values)

     if np.all(values.shape):
         algos.pad_2d_inplace(values, mask, limit=limit)
@@ -743,16 +700,11 @@ def _pad_2d(values, limit=None, mask=None):
     return values, mask


-@_datetimelike_compat
 def _backfill_2d(values, limit=None, mask=None):
-    mask = _fillna_prep(values, mask)
-
-    if np.all(values.shape):
-        algos.backfill_2d_inplace(values, mask, limit=limit)
-    else:
-        # for test coverage
-        pass
-    return values, mask
+    _, new_mask = _pad_2d(
+        values[:, ::-1], limit, mask[:, ::-1] if mask is not None else None
+    )
+    # flip the computed mask back to match the original column order
+    return values, (mask if mask is not None else new_mask[:, ::-1])


 _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 3aa4d26f7dc8f..12dab0041d4d1 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -18,11 +18,11 @@
 import numpy as np

 from pandas._libs import (
-    algos,
     hashtable,
     lib,
 )
 from pandas._libs.hashtable import unique_label_indices
+from pandas._libs_numba import algos
 from pandas._typing import IndexKeyFunc

 from pandas.core.dtypes.common import (
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 11bb554a0dc5a..3f92d66c36ca9 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -192,6 +192,7 @@ class TestPDApi(Base):
         "_hashtable",
         "_lib",
         "_libs",
+        "_libs_numba",
         "_np_version_under1p17",
         "_np_version_under1p18",
         "_is_numpy_dev",
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
index 1a12cbff47092..7a626ce6312c5 100644
--- a/pandas/tests/apply/test_frame_transform.py
+++ b/pandas/tests/apply/test_frame_transform.py
@@ -162,8 +162,6 @@ def test_transform_method_name(method):
 frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail]

-# mypy doesn't allow adding lists of different types
-# https://github.com/python/mypy/issues/5492
@pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) def test_transform_bad_dtype(op, frame_or_series): # GH 35964 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c8df18ddaeebe..4778fdfa0cb99 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,6 +9,7 @@ algos as libalgos, hashtable as ht, ) +from pandas._libs_numba import algos as libalgos_numba from pandas.compat import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -2112,14 +2113,14 @@ def test_is_lexsorted(): ), ] - assert not libalgos.is_lexsorted(failure) + assert not libalgos_numba.is_lexsorted(failure) def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) b = np.random.randint(0, 1000, 100).astype(np.int64) - result = libalgos.groupsort_indexer(a, 1000)[0] + result = libalgos_numba.groupsort_indexer(a, 1000)[0] # need to use a stable sort # np.argsort returns int, groupsort_indexer @@ -2133,7 +2134,7 @@ def test_groupsort_indexer(): # np.lexsort returns int, groupsort_indexer # always returns intp key = a * 1000 + b - result = libalgos.groupsort_indexer(key, 1000000)[0] + result = libalgos_numba.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) expected = expected.astype(np.intp) @@ -2195,7 +2196,7 @@ def test_infinity_against_nan(): def test_ensure_platform_int(): arr = np.arange(100, dtype=np.intp) - result = libalgos.ensure_platform_int(arr) + result = libalgos_numba.ensure_platform_int(arr) assert result is arr diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c5b875b8f027e..22a4379764137 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -3,7 +3,6 @@ import numpy as np -from pandas._libs.algos import unique_deltas from pandas._libs.tslibs import ( Timestamp, tzconversion, @@ -26,6 +25,7 @@ to_offset, ) from pandas._libs.tslibs.parsing import get_rule_month +from pandas._libs_numba.algos import unique_deltas from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( diff --git a/setup.cfg b/setup.cfg index a0b6a0cdfc260..bcd83f3056f35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,6 +76,7 @@ ignore = W504, # line break after binary operator E402, # module level import not at top of file E731, # do not assign a lambda expression, use a def + E741, # ambiguous variable name S001 # found modulo formatter (incorrect picks up mod operations) exclude = doc/sphinxext/*.py, @@ -162,7 +163,7 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] known_pre_libs = pandas._config -known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas.errors +known_pre_core = pandas._libs,pandas._libs_numba,pandas._typing,pandas.util._*,pandas.compat,pandas.errors known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER
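
As a quick recap of the pattern the new kernels in pandas/_libs_numba/algos.py follow (kth_smallest, diff_2d, the take_1d_* family, pad_inplace, is_lexsorted, groupsort_indexer, unique_deltas, ...), a minimal numba forward-fill kernel in the spirit of the pad_inplace that _pad_1d relies on above might look like the sketch below. This is an illustrative sketch only, not a copy of the implementation in pandas/_libs_numba/algos.py; it assumes a 1-D values array and a boolean mask that flags the missing slots.

import numba
import numpy as np

@numba.njit
def pad_inplace(values, mask, limit=None):
    # Forward fill in place: overwrite each masked slot with the most
    # recent unmasked value, filling at most `limit` consecutive slots.
    # NOTE: illustrative sketch; the PR's real kernel may differ.
    N = len(values)
    if N == 0:
        return
    lim = N if limit is None else limit
    fill_count = 0
    val = values[0]
    for i in range(N):
        if mask[i]:
            if fill_count >= lim:
                continue
            fill_count += 1
            values[i] = val
        else:
            fill_count = 0
            val = values[i]

arr = np.array([1.0, np.nan, np.nan, 4.0, np.nan])
pad_inplace(arr, np.isnan(arr), limit=1)
# arr is now [1.0, 1.0, nan, 4.0, 4.0]: at most one slot per gap is filled

Because the loop body carries only scalars, numba specializes and compiles it per input dtype at call time, which is the role the per-dtype generated Cython kernels (take_1d_int8_int32 and friends) played at build time.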