From 7689f82aa4b75ebe82f4cdb95d99affc13583b2a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 26 Apr 2022 16:32:01 -0700 Subject: [PATCH 1/2] REF: libhashtable.mode support mask --- pandas/_libs/hashtable.pyi | 5 ++- pandas/_libs/hashtable_func_helper.pxi.in | 52 ++++++++++++++--------- pandas/core/algorithms.py | 13 ++++-- pandas/core/arrays/categorical.py | 13 +++--- pandas/core/arrays/masked.py | 10 +++++ 5 files changed, 62 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 481ff0d36c460..5c7be5e660fd9 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -197,10 +197,13 @@ def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., ) -> npt.NDArray[np.bool_]: ... -def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ... +def mode( + values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None +) -> np.ndarray: ... def value_count( values: np.ndarray, dropna: bool, + mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values] # arr and values should have same dtype diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 11a45bb194c03..f7c41b32864be 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128', @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None): {{else}} -cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): {{endif}} cdef: Py_ssize_t i = 0 @@ -46,6 +46,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{c_type}} val int ret = 0 + bint uses_mask = mask is not None + bint isna_entry = False + + if uses_mask and not dropna: + raise NotImplementedError("uses_mask not implemented with dropna=False") # we track the order in which keys are first seen (GH39009), # khash-map isn't insertion-ordered, thus: @@ -56,6 +61,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): table = kh_init_{{ttype}}() {{if dtype == 'object'}} + if uses_mask: + raise NotImplementedError("uses_mask not implemented with object dtype") + kh_resize_{{ttype}}(table, n // 10) for i in range(n): @@ -74,7 +82,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): for i in range(n): val = {{to_c_type}}(values[i]) - if not is_nan_{{c_type}}(val) or not dropna: + if dropna: + if uses_mask: + isna_entry = mask[i] + else: + isna_entry = is_nan_{{c_type}}(val) + + if not dropna or not isna_entry: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 @@ -251,37 +265,37 @@ ctypedef fused htfunc_t: complex64_t -cpdef value_count(ndarray[htfunc_t] values, bint dropna): +cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): if htfunc_t is object: - return value_count_object(values, dropna) + return value_count_object(values, dropna, mask=mask) elif htfunc_t is int8_t: - return value_count_int8(values, dropna) + return value_count_int8(values, dropna, mask=mask) elif htfunc_t is int16_t: - return value_count_int16(values, dropna) + return value_count_int16(values, dropna, mask=mask) elif htfunc_t is int32_t: - return value_count_int32(values, dropna) + return value_count_int32(values, dropna, mask=mask) elif htfunc_t is int64_t: - return value_count_int64(values, dropna) + return value_count_int64(values, dropna, mask=mask) elif htfunc_t is uint8_t: - return value_count_uint8(values, dropna) + return value_count_uint8(values, dropna, mask=mask) elif htfunc_t is uint16_t: - return value_count_uint16(values, dropna) + return value_count_uint16(values, dropna, mask=mask) elif htfunc_t is uint32_t: - return value_count_uint32(values, dropna) + return value_count_uint32(values, dropna, mask=mask) elif htfunc_t is uint64_t: - return value_count_uint64(values, dropna) + return value_count_uint64(values, dropna, mask=mask) elif htfunc_t is float64_t: - return value_count_float64(values, dropna) + return value_count_float64(values, dropna, mask=mask) elif htfunc_t is float32_t: - return value_count_float32(values, dropna) + return value_count_float32(values, dropna, mask=mask) elif htfunc_t is complex128_t: - return value_count_complex128(values, dropna) + return value_count_complex128(values, dropna, mask=mask) elif htfunc_t is complex64_t: - return value_count_complex64(values, dropna) + return value_count_complex64(values, dropna, mask=mask) else: raise TypeError(values.dtype) @@ -361,7 +375,7 @@ cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values): @cython.wraparound(False) @cython.boundscheck(False) -def mode(ndarray[htfunc_t] values, bint dropna): +def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): # TODO(cython3): use const htfunct_t[:] cdef: @@ -372,7 +386,7 @@ def mode(ndarray[htfunc_t] values, bint dropna): int64_t count, max_count = -1 Py_ssize_t nkeys, k, j = 0 - keys, counts = value_count(values, dropna) + keys, counts = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0c0b93f41c657..112c401500472 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -858,12 +858,15 @@ def value_counts( # Called once from SparseArray, otherwise could be private -def value_counts_arraylike(values: np.ndarray, dropna: bool): +def value_counts_arraylike( + values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None +): """ Parameters ---------- values : np.ndarray dropna : bool + mask : np.ndarray[bool] or None, default None Returns ------- @@ -873,7 +876,7 @@ def value_counts_arraylike(values: np.ndarray, dropna: bool): original = values values = _ensure_data(values) - keys, counts = htable.value_count(values, dropna) + keys, counts = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): # datetime, timedelta, or period @@ -911,7 +914,9 @@ def duplicated( return htable.duplicated(values, keep=keep) -def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike: +def mode( + values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None +) -> ArrayLike: """ Returns the mode(s) of an array. @@ -937,7 +942,7 @@ def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike: values = _ensure_data(values) - npresult = htable.mode(values, dropna=dropna) + npresult = htable.mode(values, dropna=dropna, mask=mask) try: npresult = np.sort(npresult) except TypeError as err: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eca7a205983ef..a2ee03bac40b4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -26,7 +26,6 @@ from pandas._libs import ( NaT, algos as libalgos, - hashtable as htable, lib, ) from pandas._libs.arrays import NDArrayBacked @@ -2255,14 +2254,14 @@ def mode(self, dropna: bool = True) -> Categorical: def _mode(self, dropna: bool = True) -> Categorical: codes = self._codes + mask = None if dropna: - good = self._codes != -1 - codes = self._codes[good] + mask = self.isna() - codes = htable.mode(codes, dropna) - codes.sort() - codes = coerce_indexer_dtype(codes, self.dtype.categories) - return self._from_backing_data(codes) + res_codes = algorithms.mode(codes, mask=mask) + assert res_codes.dtype == codes.dtype + res = self._from_backing_data(res_codes) + return res # ------------------------------------------------------------------ # ExtensionArray Interface diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 95363e598a06c..5ae71b305ac60 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -57,6 +57,7 @@ ) from pandas.core import ( + algorithms as algos, arraylike, missing, nanops, @@ -907,6 +908,15 @@ def value_counts(self, dropna: bool = True) -> Series: ) from pandas.arrays import IntegerArray + if dropna: + keys, counts = algos.value_counts_arraylike( + self._data, dropna=True, mask=self._mask + ) + res = Series(counts, index=keys) + res.index = res.index.astype(self.dtype) + res = res.astype("Int64") + return res + # compute counts on the data with no nans data = self._data[~self._mask] value_counts = Index(data).value_counts() From 007cc23085c5af671facc0e09b99f3131f45e951 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 26 Apr 2022 18:38:46 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/arrays/categorical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a2ee03bac40b4..01a04b7aa63d9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2259,6 +2259,7 @@ def _mode(self, dropna: bool = True) -> Categorical: mask = self.isna() res_codes = algorithms.mode(codes, mask=mask) + res_codes = cast(np.ndarray, res_codes) assert res_codes.dtype == codes.dtype res = self._from_backing_data(res_codes) return res