Skip to content

REF: libhashtable.mode support mask #46880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,13 @@ def duplicated(
values: np.ndarray,
keep: Literal["last", "first", False] = ...,
) -> npt.NDArray[np.bool_]: ...
def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
def mode(
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> np.ndarray: ...
def value_count(
values: np.ndarray,
dropna: bool,
mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values]

# arr and values should have same dtype
Expand Down
52 changes: 33 additions & 19 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128',
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
cdef:
Py_ssize_t i = 0
Expand All @@ -46,6 +46,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{c_type}} val

int ret = 0
bint uses_mask = mask is not None
bint isna_entry = False

if uses_mask and not dropna:
raise NotImplementedError("uses_mask not implemented with dropna=False")

# we track the order in which keys are first seen (GH39009),
# khash-map isn't insertion-ordered, thus:
Expand All @@ -56,6 +61,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
table = kh_init_{{ttype}}()

{{if dtype == 'object'}}
if uses_mask:
raise NotImplementedError("uses_mask not implemented with object dtype")

kh_resize_{{ttype}}(table, n // 10)

for i in range(n):
Expand All @@ -74,7 +82,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
for i in range(n):
val = {{to_c_type}}(values[i])

if not is_nan_{{c_type}}(val) or not dropna:
if dropna:
if uses_mask:
isna_entry = mask[i]
else:
isna_entry = is_nan_{{c_type}}(val)

if not dropna or not isna_entry:
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
Expand Down Expand Up @@ -251,37 +265,37 @@ ctypedef fused htfunc_t:
complex64_t


cpdef value_count(ndarray[htfunc_t] values, bint dropna):
cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
if htfunc_t is object:
return value_count_object(values, dropna)
return value_count_object(values, dropna, mask=mask)

elif htfunc_t is int8_t:
return value_count_int8(values, dropna)
return value_count_int8(values, dropna, mask=mask)
elif htfunc_t is int16_t:
return value_count_int16(values, dropna)
return value_count_int16(values, dropna, mask=mask)
elif htfunc_t is int32_t:
return value_count_int32(values, dropna)
return value_count_int32(values, dropna, mask=mask)
elif htfunc_t is int64_t:
return value_count_int64(values, dropna)
return value_count_int64(values, dropna, mask=mask)

elif htfunc_t is uint8_t:
return value_count_uint8(values, dropna)
return value_count_uint8(values, dropna, mask=mask)
elif htfunc_t is uint16_t:
return value_count_uint16(values, dropna)
return value_count_uint16(values, dropna, mask=mask)
elif htfunc_t is uint32_t:
return value_count_uint32(values, dropna)
return value_count_uint32(values, dropna, mask=mask)
elif htfunc_t is uint64_t:
return value_count_uint64(values, dropna)
return value_count_uint64(values, dropna, mask=mask)

elif htfunc_t is float64_t:
return value_count_float64(values, dropna)
return value_count_float64(values, dropna, mask=mask)
elif htfunc_t is float32_t:
return value_count_float32(values, dropna)
return value_count_float32(values, dropna, mask=mask)

elif htfunc_t is complex128_t:
return value_count_complex128(values, dropna)
return value_count_complex128(values, dropna, mask=mask)
elif htfunc_t is complex64_t:
return value_count_complex64(values, dropna)
return value_count_complex64(values, dropna, mask=mask)

else:
raise TypeError(values.dtype)
Expand Down Expand Up @@ -361,7 +375,7 @@ cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):

@cython.wraparound(False)
@cython.boundscheck(False)
def mode(ndarray[htfunc_t] values, bint dropna):
def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
# TODO(cython3): use const htfunc_t[:]

cdef:
Expand All @@ -372,7 +386,7 @@ def mode(ndarray[htfunc_t] values, bint dropna):
int64_t count, max_count = -1
Py_ssize_t nkeys, k, j = 0

keys, counts = value_count(values, dropna)
keys, counts = value_count(values, dropna, mask=mask)
nkeys = len(keys)

modes = np.empty(nkeys, dtype=values.dtype)
Expand Down
13 changes: 9 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,12 +858,15 @@ def value_counts(


# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(values: np.ndarray, dropna: bool):
def value_counts_arraylike(
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
):
"""
Parameters
----------
values : np.ndarray
dropna : bool
mask : np.ndarray[bool] or None, default None

Returns
-------
Expand All @@ -873,7 +876,7 @@ def value_counts_arraylike(values: np.ndarray, dropna: bool):
original = values
values = _ensure_data(values)

keys, counts = htable.value_count(values, dropna)
keys, counts = htable.value_count(values, dropna, mask=mask)

if needs_i8_conversion(original.dtype):
# datetime, timedelta, or period
Expand Down Expand Up @@ -911,7 +914,9 @@ def duplicated(
return htable.duplicated(values, keep=keep)


def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike:
def mode(
values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
"""
Returns the mode(s) of an array.

Expand All @@ -937,7 +942,7 @@ def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike:

values = _ensure_data(values)

npresult = htable.mode(values, dropna=dropna)
npresult = htable.mode(values, dropna=dropna, mask=mask)
try:
npresult = np.sort(npresult)
except TypeError as err:
Expand Down
14 changes: 7 additions & 7 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from pandas._libs import (
NaT,
algos as libalgos,
hashtable as htable,
lib,
)
from pandas._libs.arrays import NDArrayBacked
Expand Down Expand Up @@ -2255,14 +2254,15 @@ def mode(self, dropna: bool = True) -> Categorical:

def _mode(self, dropna: bool = True) -> Categorical:
codes = self._codes
mask = None
if dropna:
good = self._codes != -1
codes = self._codes[good]
mask = self.isna()

codes = htable.mode(codes, dropna)
codes.sort()
codes = coerce_indexer_dtype(codes, self.dtype.categories)
return self._from_backing_data(codes)
res_codes = algorithms.mode(codes, mask=mask)
res_codes = cast(np.ndarray, res_codes)
assert res_codes.dtype == codes.dtype
res = self._from_backing_data(res_codes)
return res

# ------------------------------------------------------------------
# ExtensionArray Interface
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
)

from pandas.core import (
algorithms as algos,
arraylike,
missing,
nanops,
Expand Down Expand Up @@ -907,6 +908,15 @@ def value_counts(self, dropna: bool = True) -> Series:
)
from pandas.arrays import IntegerArray

if dropna:
keys, counts = algos.value_counts_arraylike(
self._data, dropna=True, mask=self._mask
)
res = Series(counts, index=keys)
res.index = res.index.astype(self.dtype)
res = res.astype("Int64")
return res

# compute counts on the data with no nans
data = self._data[~self._mask]
value_counts = Index(data).value_counts()
Expand Down