From d0fb953b2b31b7b17f63586e787748ca9b6d18f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 17:50:27 -0700 Subject: [PATCH 1/3] REF: remove algos_rank_helper --- pandas/_libs/algos.pyx | 407 ++++++++++++++++++++++++- pandas/_libs/algos_rank_helper.pxi.in | 410 -------------------------- pandas/_libs/algos_take_helper.pxi.in | 299 ------------------- setup.py | 1 - 4 files changed, 406 insertions(+), 711 deletions(-) delete mode 100644 pandas/_libs/algos_rank_helper.pxi.in delete mode 100644 pandas/_libs/algos_take_helper.pxi.in diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0f91f612994c7..a4980633b5d1c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -771,7 +771,412 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic +# ---------------------------------------------------------------------- +# rank_1d, rank_2d +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 + + ndarray[rank_t] sorted_data, values + + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + ndarray[uint8_t, cast=True] sorted_mask + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + bint isnan, condition + float64_t count = 0.0 + + tiebreak = tiebreakers[ties_method] + + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) + + keep_na = na_option == 'keep' + + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() + + # double sort first by mask and then by values to ensure nan values are + # either at the beginning or the end. mask/(~mask) controls padding at + # tail or the head + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) + else: + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + if rank_t is object: + _as = np.lexsort(keys=order) + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + sorted_mask = mask.take(_as) + _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] + non_na_idx = _indices[0] if len(_indices) > 0 else -1 + argsorted = _as.astype('i8') + + if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + if pct: + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count + else: + return ranks + + +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + + Py_ssize_t infs + + ndarray[float64_t, ndim=2] ranks + ndarray[rank_t, ndim=2] values + + ndarray[int64_t, ndim=2] argsorted + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float64_t count = 0.0 + bint condition, skip_condition + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + if axis == 0: + values = np.asarray(in_arr).T.copy() + else: + values = np.asarray(in_arr).copy() + + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') + + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) + + n, k = (values).shape + ranks = np.empty((n, k), dtype='f8') + + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 + + total_tie_count = 0 + count = 0.0 + for j in range(k): + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 + + val = values[i, j] + + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN + + if rank_t is object: + infs += 1 + + continue + + count += 1.0 + + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 + + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] + + # generated from template include "algos_common_helper.pxi" -include "algos_rank_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in deleted file mode 100644 index d5a31b6a13010..0000000000000 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ /dev/null @@ -1,410 +0,0 @@ -""" -Template for each `dtype` helper function for rank - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# rank_1d, rank_2d -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - object - float64_t - uint64_t - int64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def rank_1d(rank_t[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - - ndarray[rank_t] sorted_data, values - - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - ndarray[uint8_t, cast=True] sorted_mask - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - bint isnan, condition - float64_t count = 0.0 - - tiebreak = tiebreakers[ties_method] - - if rank_t is float64_t: - values = np.asarray(in_arr).copy() - elif rank_t is object: - values = np.array(in_arr, copy=True) - - if values.dtype != np.object_: - values = values.astype('O') - else: - values = np.asarray(in_arr) - - keep_na = na_option == 'keep' - - if rank_t is object: - mask = missing.isnaobj(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - - # double sort first by mask and then by values to ensure nan values are - # either at the beginning or the end. mask/(~mask) controls padding at - # tail or the head - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - order = (values, mask) - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).min - - order = (values, ~mask) - np.putmask(values, mask, nan_value) - else: - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - if rank_t is object: - _as = np.lexsort(keys=order) - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = np.lexsort(keys=order) - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - sorted_mask = mask.take(_as) - _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] - non_na_idx = _indices[0] if len(_indices) > 0 else -1 - argsorted = _as.astype('i8') - - if rank_t is object: - # TODO: de-duplicate once cython supports conditional nogil - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - else: - with nogil: - # TODO: why does the 2d version not have a nogil block? - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - if pct: - if tiebreak == TIEBREAK_DENSE: - return ranks / total_tie_count - else: - return ranks / count - else: - return ranks - - -rank_1d_object = rank_1d["object"] -rank_1d_float64 = rank_1d["float64_t"] -rank_1d_uint64 = rank_1d["uint64_t"] -rank_1d_int64 = rank_1d["int64_t"] - - -def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - - Py_ssize_t infs - - ndarray[float64_t, ndim=2] ranks - ndarray[rank_t, ndim=2] values - - ndarray[int64_t, ndim=2] argsorted - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float64_t count = 0.0 - bint condition, skip_condition - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - if axis == 0: - values = np.asarray(in_arr).T.copy() - else: - values = np.asarray(in_arr).copy() - - if rank_t is object: - if values.dtype != np.object_: - values = values.astype('O') - - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = NPY_NAT - - if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - np.putmask(values, mask, nan_value) - - n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') - - if rank_t is object: - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = _take_2d(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - if rank_t is object: - dups = sum_ranks = infs = 0 - else: - dups = sum_ranks = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - if rank_t is not object: - sum_ranks += j + 1 - dups += 1 - - val = values[i, j] - - if rank_t is not uint64_t: - if rank_t is object: - skip_condition = (val is nan_value) and keep_na - else: - skip_condition = (val == nan_value) and keep_na - if skip_condition: - ranks[i, argsorted[i, j]] = NaN - - if rank_t is object: - infs += 1 - - continue - - count += 1.0 - - if rank_t is object: - sum_ranks += (j - infs) + 1 - dups += 1 - - if rank_t is object: - condition = j == k - 1 or are_diff(values[i, j + 1], val) - else: - condition = j == k - 1 or values[i, j + 1] != val - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported ' - 'for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -rank_2d_object = rank_2d["object"] -rank_2d_float64 = rank_2d["float64_t"] -rank_2d_uint64 = rank_2d["uint64_t"] -rank_2d_int64 = rank_2d["int64_t"] diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in deleted file mode 100644 index e7ee212065c5b..0000000000000 --- a/pandas/_libs/algos_take_helper.pxi.in +++ /dev/null @@ -1,299 +0,0 @@ -""" -Template for each `dtype` helper function for take - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# take_1d, take_2d -# ---------------------------------------------------------------------- - -{{py: - -# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil -dtypes = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), - ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), - ('object', 'object', 'object', 'object', '', '', False)] - - -def get_dispatch(dtypes): - - inner_take_1d_template = """ - cdef: - Py_ssize_t i, n, idx - %(c_type_out)s fv - - n = indexer.shape[0] - - fv = fill_value - - %(nogil_str)s - %(tab)sfor i in range(n): - %(tab)s idx = indexer[i] - %(tab)s if idx == -1: - %(tab)s out[i] = fv - %(tab)s else: - %(tab)s out[i] = %(preval)svalues[idx]%(postval)s -""" - - inner_take_2d_axis0_template = """\ - cdef: - Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF %(can_copy)s: - cdef: - %(c_type_out)s *v - %(c_type_out)s *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(%(c_type_out)s) and - sizeof(%(c_type_out)s) * n >= 256): - - for i in range(n): - idx = indexer[i] - if idx == -1: - for j in range(k): - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(%(c_type_out)s) * k)) - return - - for i in range(n): - idx = indexer[i] - if idx == -1: - for j in range(k): - out[i, j] = fv - else: - for j in range(k): - out[i, j] = %(preval)svalues[idx, j]%(postval)s -""" - - inner_take_2d_axis1_template = """\ - cdef: - Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i in range(n): - for j in range(k): - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = %(preval)svalues[i, idx]%(postval)s -""" - - for (name, dest, c_type_in, c_type_out, preval, postval, - can_copy) in dtypes: - - nogil = c_type_out != "object" - if nogil: - nogil_str = "with nogil:" - tab = ' ' - else: - nogil_str = '' - tab = '' - - args = dict(name=name, dest=dest, c_type_in=c_type_in, - c_type_out=c_type_out, preval=preval, postval=postval, - can_copy=can_copy, nogil_str=nogil_str, tab=tab) - - inner_take_1d = inner_take_1d_template % args - inner_take_2d_axis0 = inner_take_2d_axis0_template % args - inner_take_2d_axis1 = inner_take_2d_axis1_template % args - - yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy, - inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1) - -}} - - -{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy, - inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1 - in get_dispatch(dtypes)}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, - const int64_t[:] indexer, - {{c_type_out}}[:] out, - fill_value=np.nan): - - -{{inner_take_1d}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, - const int64_t[:] indexer, - {{c_type_out}}[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_{{name}}_{{dest}}_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -{{inner_take_1d}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, - const int64_t[:] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): -{{inner_take_2d_axis0}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, - ndarray[int64_t] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -{{inner_take_2d_axis0}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, - const int64_t[:] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): -{{inner_take_2d_axis1}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, - ndarray[int64_t] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -{{inner_take_2d_axis1}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, - indexer, - ndarray[{{c_type_out}}, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - {{c_type_out}} fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - out[i, j] = fv - else: - for j in range(k): - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} - -{{endfor}} - -# ---------------------------------------------------------------------- -# take_2d internal function -# ---------------------------------------------------------------------- - -ctypedef fused take_t: - float64_t - uint64_t - int64_t - object - - -cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): - cdef: - Py_ssize_t i, j, N, K - ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx - ndarray[take_t, ndim=2] result - - N, K = (values).shape - - if take_t is object: - # evaluated at compile-time - result = values.copy() - else: - result = np.empty_like(values) - - for i in range(N): - for j in range(K): - result[i, j] = values[i, indexer[i, j]] - return result - - -_take_2d_object = _take_2d[object] -_take_2d_float64 = _take_2d[float64_t] -_take_2d_int64 = _take_2d[int64_t] -_take_2d_uint64 = _take_2d[uint64_t] diff --git a/setup.py b/setup.py index 2892cd0b2e294..0dd1980088db8 100755 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ def is_platform_mac(): "algos": [ "_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in", - "_libs/algos_rank_helper.pxi.in", ], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", From 37a8666b3b60d55b14c7785ee9f6a068afe644b3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 17:51:50 -0700 Subject: [PATCH 2/3] FIXUP: revert accidental deletion --- pandas/_libs/algos_take_helper.pxi.in | 299 ++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 pandas/_libs/algos_take_helper.pxi.in diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in new file mode 100644 index 0000000000000..e7ee212065c5b --- /dev/null +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -0,0 +1,299 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +# ---------------------------------------------------------------------- +# take_1d, take_2d +# ---------------------------------------------------------------------- + +{{py: + +# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil +dtypes = [ + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), + ('bool', 'object', 'uint8_t', 'object', + 'True if ', ' > 0 else False', False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), + ('object', 'object', 'object', 'object', '', '', False)] + + +def get_dispatch(dtypes): + + inner_take_1d_template = """ + cdef: + Py_ssize_t i, n, idx + %(c_type_out)s fv + + n = indexer.shape[0] + + fv = fill_value + + %(nogil_str)s + %(tab)sfor i in range(n): + %(tab)s idx = indexer[i] + %(tab)s if idx == -1: + %(tab)s out[i] = fv + %(tab)s else: + %(tab)s out[i] = %(preval)svalues[idx]%(postval)s +""" + + inner_take_2d_axis0_template = """\ + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF %(can_copy)s: + cdef: + %(c_type_out)s *v + %(c_type_out)s *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(%(c_type_out)s) and + sizeof(%(c_type_out)s) * n >= 256): + + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(%(c_type_out)s) * k)) + return + + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + for j in range(k): + out[i, j] = %(preval)svalues[idx, j]%(postval)s +""" + + inner_take_2d_axis1_template = """\ + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = %(preval)svalues[i, idx]%(postval)s +""" + + for (name, dest, c_type_in, c_type_out, preval, postval, + can_copy) in dtypes: + + nogil = c_type_out != "object" + if nogil: + nogil_str = "with nogil:" + tab = ' ' + else: + nogil_str = '' + tab = '' + + args = dict(name=name, dest=dest, c_type_in=c_type_in, + c_type_out=c_type_out, preval=preval, postval=postval, + can_copy=can_copy, nogil_str=nogil_str, tab=tab) + + inner_take_1d = inner_take_1d_template % args + inner_take_2d_axis0 = inner_take_2d_axis0_template % args + inner_take_2d_axis1 = inner_take_2d_axis1_template % args + + yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy, + inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1) + +}} + + +{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy, + inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1 + in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, + const int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + +{{inner_take_1d}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, + const int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_1d}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, + const int64_t[:] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): +{{inner_take_2d_axis0}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_2d_axis0}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, + const int64_t[:] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): +{{inner_take_2d_axis1}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_2d_axis1}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + indexer, + ndarray[{{c_type_out}}, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + {{c_type_out}} fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + out[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} + +{{endfor}} + +# ---------------------------------------------------------------------- +# take_2d internal function +# ---------------------------------------------------------------------- + +ctypedef fused take_t: + float64_t + uint64_t + int64_t + object + + +cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[take_t, ndim=2] result + + N, K = (values).shape + + if take_t is object: + # evaluated at compile-time + result = values.copy() + else: + result = np.empty_like(values) + + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + + +_take_2d_object = _take_2d[object] +_take_2d_float64 = _take_2d[float64_t] +_take_2d_int64 = _take_2d[int64_t] +_take_2d_uint64 = _take_2d[uint64_t] From 1ef7732c130ec4bccb879de16665a0ad4d1e48bb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 18:24:49 -0700 Subject: [PATCH 3/3] lint fixup --- pandas/_libs/algos.pyx | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a4980633b5d1c..cab8bc8e799d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -900,13 +900,17 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', count += 1.0 if rank_t is object: - condition = (i == n - 1 or + condition = ( + i == n - 1 or are_diff(sorted_data[i + 1], val) or - i == non_na_idx) + i == non_na_idx + ) else: - condition = (i == n - 1 or + condition = ( + i == n - 1 or sorted_data[i + 1] != val or - i == non_na_idx) + i == non_na_idx + ) if condition: @@ -953,13 +957,17 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', count += 1.0 if rank_t is object: - condition = (i == n - 1 or + condition = ( + i == n - 1 or are_diff(sorted_data[i + 1], val) or - i == non_na_idx) + i == non_na_idx + ) else: - condition = (i == n - 1 or + condition = ( + i == n - 1 or sorted_data[i + 1] != val or - i == non_na_idx) + i == non_na_idx + ) if condition: