From f0b08f68201ffc7246db244326e41fdaf6590cbc Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 20:50:15 -0700 Subject: [PATCH 1/2] CLN/TYP: _libs --- pandas/_libs/groupby.pyx | 51 +++++++++++----------- pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable.pyx | 6 +-- pandas/_libs/hashtable_class_helper.pxi.in | 27 +++++++----- pandas/_libs/index.pyx | 3 +- pandas/_libs/index_class_helper.pxi.in | 3 +- pandas/_libs/internals.pyx | 4 +- pandas/_libs/tslibs/fields.pyx | 6 +-- 8 files changed, 54 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e7cd7cd898d5b..48ee01c809efd 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -106,7 +106,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[intp_t] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """ Only aggregates on axis=0 """ @@ -148,7 +148,7 @@ def group_cumprod_float64(float64_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - bint skipna=True): + bint skipna=True) -> None: """ Cumulative product of columns of `values`, in row groups `labels`. @@ -205,7 +205,7 @@ def group_cumsum(numeric[:, ::1] out, const intp_t[:] labels, int ngroups, is_datetimelike, - bint skipna=True): + bint skipna=True) -> None: """ Cumulative sum of columns of `values`, in row groups `labels`. @@ -270,7 +270,7 @@ def group_cumsum(numeric[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels, - int ngroups, int periods): + int ngroups, int periods) -> None: cdef: Py_ssize_t N, i, j, ii, lab int offset = 0, sign @@ -322,14 +322,14 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels, @cython.wraparound(False) @cython.boundscheck(False) def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels, - ndarray[uint8_t] mask, object direction, - int64_t limit, bint dropna): + ndarray[uint8_t] mask, str direction, + int64_t limit, bint dropna) -> None: """ Indexes how to fill values forwards or backwards within a group. Parameters ---------- - out : np.ndarray[np.uint8] + out : np.ndarray[np.int64] Values into which this method will write its results. labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering @@ -392,8 +392,8 @@ def group_any_all(uint8_t[::1] out, const uint8_t[::1] values, const intp_t[:] labels, const uint8_t[::1] mask, - object val_test, - bint skipna): + str val_test, + bint skipna) -> None: """ Aggregated boolean values to show truthfulness of group elements. @@ -465,7 +465,7 @@ def group_add(complexfloating_t[:, ::1] out, int64_t[::1] counts, ndarray[complexfloating_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=0) -> None: """ Only aggregates on axis=0 using Kahan summation """ @@ -518,7 +518,7 @@ def group_prod(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=0): + Py_ssize_t min_count=0) -> None: """ Only aggregates on axis=0 """ @@ -568,7 +568,7 @@ def group_var(floating[:, ::1] out, ndarray[floating, ndim=2] values, const intp_t[:] labels, Py_ssize_t min_count=-1, - int64_t ddof=1): + int64_t ddof=1) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -621,7 +621,7 @@ def group_mean(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[::1] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count, y, t @@ -673,7 +673,7 @@ def group_ohlc(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """ Only aggregates on axis=0 """ @@ -721,7 +721,7 @@ def group_quantile(ndarray[float64_t] out, ndarray[intp_t] labels, ndarray[uint8_t] mask, float64_t q, - object interpolation): + str interpolation) -> None: """ Calculate the quantile per group. @@ -733,8 +733,6 @@ def group_quantile(ndarray[float64_t] out, Array containing the values to apply the function against. labels : ndarray[np.intp] Array containing the unique group labels. - values : ndarray - Array containing the values to apply the function against. q : float The quantile value to search for. interpolation : {'linear', 'lower', 'highest', 'nearest', 'midpoint'} @@ -865,7 +863,7 @@ def group_last(rank_t[:, ::1] out, int64_t[::1] counts, ndarray[rank_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """ Only aggregates on axis=0 """ @@ -957,8 +955,9 @@ def group_nth(rank_t[:, ::1] out, int64_t[::1] counts, ndarray[rank_t, ndim=2] values, const intp_t[:] labels, - int64_t min_count=-1, int64_t rank=1 - ): + int64_t min_count=-1, + int64_t rank=1, + ) -> None: """ Only aggregates on axis=0 """ @@ -1050,8 +1049,8 @@ def group_rank(float64_t[:, ::1] out, ndarray[rank_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike, object ties_method="average", - bint ascending=True, bint pct=False, object na_option="keep"): + bint is_datetimelike, str ties_method="average", + bint ascending=True, bint pct=False, str na_option="keep") -> None: """ Provides the rank of values within each group. @@ -1221,7 +1220,7 @@ def group_max(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """See group_min_max.__doc__""" group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True) @@ -1232,7 +1231,7 @@ def group_min(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1) -> None: """See group_min_max.__doc__""" group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False) @@ -1311,7 +1310,7 @@ def group_cummin(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike): + bint is_datetimelike) -> None: """See group_cummin_max.__doc__""" group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=False) @@ -1322,6 +1321,6 @@ def group_cummax(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, int ngroups, - bint is_datetimelike): + bint is_datetimelike) -> None: """See group_cummin_max.__doc__""" group_cummin_max(out, values, labels, ngroups, is_datetimelike, compute_max=True) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 735d8c07f4774..a5679af44ac06 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -134,6 +134,6 @@ cdef class Int64Vector: cdef bint external_view_exists cdef resize(self) - cpdef to_array(self) + cpdef ndarray to_array(self) cdef inline void append(self, int64_t x) cdef extend(self, int64_t[:] x) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index e402a4b7c0ccc..1e2a336f12444 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -61,7 +61,7 @@ cdef class Factorizer: ObjectVector uniques Py_ssize_t count - def __init__(self, size_hint): + def __init__(self, size_hint: int): self.table = PyObjectHashTable(size_hint) self.uniques = ObjectVector() self.count = 0 @@ -116,12 +116,12 @@ cdef class Int64Factorizer: Int64Vector uniques Py_ssize_t count - def __init__(self, size_hint): + def __init__(self, size_hint: int): self.table = Int64HashTable(size_hint) self.uniques = Int64Vector() self.count = 0 - def get_count(self): + def get_count(self) -> int: return self.count def factorize(self, const int64_t[:] values, sort=False, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 301644274111b..b80a127be970d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -220,7 +220,7 @@ cdef class {{name}}Vector: def __len__(self) -> int: return self.data.n - cpdef to_array(self): + cpdef ndarray to_array(self): if self.data.m != self.data.n: if self.external_view_exists: # should never happen @@ -288,7 +288,7 @@ cdef class StringVector: def __len__(self) -> int: return self.data.n - def to_array(self): + cpdef ndarray[object, ndim=1] to_array(self): cdef: ndarray ao Py_ssize_t n @@ -345,7 +345,7 @@ cdef class ObjectVector: self.data[self.n] = obj self.n += 1 - def to_array(self): + cpdef ndarray[object, ndim=1] to_array(self): if self.m != self.n: if self.external_view_exists: raise ValueError("should have raised on append()") @@ -403,7 +403,7 @@ cdef class {{name}}HashTable(HashTable): kh_destroy_{{dtype}}(self.table) self.table = NULL - def __contains__(self, object key): + def __contains__(self, object key) -> bool: cdef: khiter_t k {{c_type}} ckey @@ -452,7 +452,7 @@ cdef class {{name}}HashTable(HashTable): raise KeyError(key) @cython.boundscheck(False) - def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values): + def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -466,7 +466,7 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = values[i] @cython.boundscheck(False) - def map_locations(self, const {{dtype}}_t[:] values): + def map_locations(self, const {{dtype}}_t[:] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -480,7 +480,8 @@ cdef class {{name}}HashTable(HashTable): self.table.vals[k] = i @cython.boundscheck(False) - def lookup(self, const {{dtype}}_t[:] values): + def lookup(self, const {{dtype}}_t[:] values) -> ndarray: + # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -818,7 +819,8 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values): + def lookup(self, ndarray[object] values) -> ndarray: + # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -853,7 +855,7 @@ cdef class StringHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values): + def map_locations(self, ndarray[object] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1071,7 +1073,7 @@ cdef class PyObjectHashTable(HashTable): def __len__(self) -> int: return self.table.size - def __contains__(self, object key): + def __contains__(self, object key) -> bool: cdef: khiter_t k hash(key) @@ -1123,7 +1125,7 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(key) - def map_locations(self, ndarray[object] values): + def map_locations(self, ndarray[object] values) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1137,7 +1139,8 @@ cdef class PyObjectHashTable(HashTable): k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i - def lookup(self, ndarray[object] values): + def lookup(self, ndarray[object] values) -> ndarray: + # -> np.ndarray[np.intp] cdef: Py_ssize_t i, n = len(values) int ret = 0 diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 47e6d417bb925..f1f56c6c0c855 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -132,6 +132,7 @@ cdef class IndexEngine: return self._maybe_get_bool_indexer(val) cdef _maybe_get_bool_indexer(self, object val): + # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer @@ -247,7 +248,7 @@ cdef class IndexEngine: self.need_unique_check = 0 - cdef void _call_map_locations(self, values): + cdef void _call_map_locations(self, ndarray values): self.mapping.map_locations(values) def clear_mapping(self): diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index e5026ce2fa292..8638c2c689c3f 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -44,10 +44,11 @@ cdef class {{name}}Engine(IndexEngine): raise KeyError(val) {{endif}} - cdef void _call_map_locations(self, values): + cdef void _call_map_locations(self, ndarray values): self.mapping.map_locations(algos.ensure_{{name.lower()}}(values)) cdef _maybe_get_bool_indexer(self, object val): + # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer ndarray[intp_t, ndim=1] found diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 4295db9d1613c..346930a51b8b3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -372,7 +372,9 @@ cdef slice indexer_as_slice(intp_t[:] vals): @cython.boundscheck(False) @cython.wraparound(False) -def get_blkno_indexers(int64_t[:] blknos, bint group=True): +def get_blkno_indexers( + int64_t[:] blknos, bint group=True +) -> list[tuple[int, slice | np.ndarray]]: """ Enumerate contiguous runs of integers in ndarray. diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index d6ca38e57d2d8..a679527434dd3 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -93,7 +93,7 @@ def build_field_sarray(const int64_t[:] dtindex): return out -def month_position_check(fields, weekdays): +def month_position_check(fields, weekdays) -> str | None: cdef: int32_t daysinmonth, y, m, d bint calendar_end = True @@ -198,7 +198,7 @@ cdef inline bint _is_on_month(int month, int compare_month, int modby) nogil: @cython.wraparound(False) @cython.boundscheck(False) def get_start_end_field(const int64_t[:] dtindex, str field, - object freqstr=None, int month_kw=12): + str freqstr=None, int month_kw=12): """ Given an int64-based datetime index return array of indicators of whether timestamps are at the start/end of the month/quarter/year @@ -755,7 +755,7 @@ cdef inline ndarray[int64_t] _roundup_int64(values, int64_t unit): return _floor_int64(values + unit // 2, unit) -def round_nsint64(values: np.ndarray, mode: RoundTo, nanos) -> np.ndarray: +def round_nsint64(values: np.ndarray, mode: RoundTo, nanos: int) -> np.ndarray: """ Applies rounding mode at given frequency From d49cddc23f6dcdf138951309bd02c794b359e36e Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 21:04:17 -0700 Subject: [PATCH 2/2] revert to troubleshoot --- pandas/_libs/internals.pyx | 4 +--- pandas/_libs/tslibs/fields.pyx | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 346930a51b8b3..4295db9d1613c 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -372,9 +372,7 @@ cdef slice indexer_as_slice(intp_t[:] vals): @cython.boundscheck(False) @cython.wraparound(False) -def get_blkno_indexers( - int64_t[:] blknos, bint group=True -) -> list[tuple[int, slice | np.ndarray]]: +def get_blkno_indexers(int64_t[:] blknos, bint group=True): """ Enumerate contiguous runs of integers in ndarray. diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index a679527434dd3..d6ca38e57d2d8 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -93,7 +93,7 @@ def build_field_sarray(const int64_t[:] dtindex): return out -def month_position_check(fields, weekdays) -> str | None: +def month_position_check(fields, weekdays): cdef: int32_t daysinmonth, y, m, d bint calendar_end = True @@ -198,7 +198,7 @@ cdef inline bint _is_on_month(int month, int compare_month, int modby) nogil: @cython.wraparound(False) @cython.boundscheck(False) def get_start_end_field(const int64_t[:] dtindex, str field, - str freqstr=None, int month_kw=12): + object freqstr=None, int month_kw=12): """ Given an int64-based datetime index return array of indicators of whether timestamps are at the start/end of the month/quarter/year @@ -755,7 +755,7 @@ cdef inline ndarray[int64_t] _roundup_int64(values, int64_t unit): return _floor_int64(values + unit // 2, unit) -def round_nsint64(values: np.ndarray, mode: RoundTo, nanos: int) -> np.ndarray: +def round_nsint64(values: np.ndarray, mode: RoundTo, nanos) -> np.ndarray: """ Applies rounding mode at given frequency