Skip to content

Commit 619ce2d

Browse files
authored
REF: libhashtable.mode support mask (#46880)
1 parent 9ad8150 commit 619ce2d

File tree

5 files changed

+63
-31
lines changed

5 files changed

+63
-31
lines changed

pandas/_libs/hashtable.pyi

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,10 +197,13 @@ def duplicated(
197197
values: np.ndarray,
198198
keep: Literal["last", "first", False] = ...,
199199
) -> npt.NDArray[np.bool_]: ...
200-
def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
200+
def mode(
201+
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
202+
) -> np.ndarray: ...
201203
def value_count(
202204
values: np.ndarray,
203205
dropna: bool,
206+
mask: npt.NDArray[np.bool_] | None = None,
204207
) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values]
205208

206209
# arr and values should have same dtype

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128',
3131
@cython.wraparound(False)
3232
@cython.boundscheck(False)
3333
{{if dtype == 'object'}}
34-
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
34+
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
3535
{{else}}
36-
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
36+
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
3737
{{endif}}
3838
cdef:
3939
Py_ssize_t i = 0
@@ -46,6 +46,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
4646
{{c_type}} val
4747

4848
int ret = 0
49+
bint uses_mask = mask is not None
50+
bint isna_entry = False
51+
52+
if uses_mask and not dropna:
53+
raise NotImplementedError("uses_mask not implemented with dropna=False")
4954

5055
# we track the order in which keys are first seen (GH39009),
5156
# khash-map isn't insertion-ordered, thus:
@@ -56,6 +61,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
5661
table = kh_init_{{ttype}}()
5762

5863
{{if dtype == 'object'}}
64+
if uses_mask:
65+
raise NotImplementedError("uses_mask not implemented with object dtype")
66+
5967
kh_resize_{{ttype}}(table, n // 10)
6068

6169
for i in range(n):
@@ -74,7 +82,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
7482
for i in range(n):
7583
val = {{to_c_type}}(values[i])
7684

77-
if not is_nan_{{c_type}}(val) or not dropna:
85+
if dropna:
86+
if uses_mask:
87+
isna_entry = mask[i]
88+
else:
89+
isna_entry = is_nan_{{c_type}}(val)
90+
91+
if not dropna or not isna_entry:
7892
k = kh_get_{{ttype}}(table, val)
7993
if k != table.n_buckets:
8094
table.vals[k] += 1
@@ -251,37 +265,37 @@ ctypedef fused htfunc_t:
251265
complex64_t
252266

253267

254-
cpdef value_count(ndarray[htfunc_t] values, bint dropna):
268+
cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
255269
if htfunc_t is object:
256-
return value_count_object(values, dropna)
270+
return value_count_object(values, dropna, mask=mask)
257271

258272
elif htfunc_t is int8_t:
259-
return value_count_int8(values, dropna)
273+
return value_count_int8(values, dropna, mask=mask)
260274
elif htfunc_t is int16_t:
261-
return value_count_int16(values, dropna)
275+
return value_count_int16(values, dropna, mask=mask)
262276
elif htfunc_t is int32_t:
263-
return value_count_int32(values, dropna)
277+
return value_count_int32(values, dropna, mask=mask)
264278
elif htfunc_t is int64_t:
265-
return value_count_int64(values, dropna)
279+
return value_count_int64(values, dropna, mask=mask)
266280

267281
elif htfunc_t is uint8_t:
268-
return value_count_uint8(values, dropna)
282+
return value_count_uint8(values, dropna, mask=mask)
269283
elif htfunc_t is uint16_t:
270-
return value_count_uint16(values, dropna)
284+
return value_count_uint16(values, dropna, mask=mask)
271285
elif htfunc_t is uint32_t:
272-
return value_count_uint32(values, dropna)
286+
return value_count_uint32(values, dropna, mask=mask)
273287
elif htfunc_t is uint64_t:
274-
return value_count_uint64(values, dropna)
288+
return value_count_uint64(values, dropna, mask=mask)
275289

276290
elif htfunc_t is float64_t:
277-
return value_count_float64(values, dropna)
291+
return value_count_float64(values, dropna, mask=mask)
278292
elif htfunc_t is float32_t:
279-
return value_count_float32(values, dropna)
293+
return value_count_float32(values, dropna, mask=mask)
280294

281295
elif htfunc_t is complex128_t:
282-
return value_count_complex128(values, dropna)
296+
return value_count_complex128(values, dropna, mask=mask)
283297
elif htfunc_t is complex64_t:
284-
return value_count_complex64(values, dropna)
298+
return value_count_complex64(values, dropna, mask=mask)
285299

286300
else:
287301
raise TypeError(values.dtype)
@@ -361,7 +375,7 @@ cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
361375

362376
@cython.wraparound(False)
363377
@cython.boundscheck(False)
364-
def mode(ndarray[htfunc_t] values, bint dropna):
378+
def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
365379
# TODO(cython3): use const htfunc_t[:]
366380

367381
cdef:
@@ -372,7 +386,7 @@ def mode(ndarray[htfunc_t] values, bint dropna):
372386
int64_t count, max_count = -1
373387
Py_ssize_t nkeys, k, j = 0
374388

375-
keys, counts = value_count(values, dropna)
389+
keys, counts = value_count(values, dropna, mask=mask)
376390
nkeys = len(keys)
377391

378392
modes = np.empty(nkeys, dtype=values.dtype)

pandas/core/algorithms.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -858,12 +858,15 @@ def value_counts(
858858

859859

860860
# Called once from SparseArray, otherwise could be private
861-
def value_counts_arraylike(values: np.ndarray, dropna: bool):
861+
def value_counts_arraylike(
862+
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
863+
):
862864
"""
863865
Parameters
864866
----------
865867
values : np.ndarray
866868
dropna : bool
869+
mask : np.ndarray[bool] or None, default None
867870
868871
Returns
869872
-------
@@ -873,7 +876,7 @@ def value_counts_arraylike(values: np.ndarray, dropna: bool):
873876
original = values
874877
values = _ensure_data(values)
875878

876-
keys, counts = htable.value_count(values, dropna)
879+
keys, counts = htable.value_count(values, dropna, mask=mask)
877880

878881
if needs_i8_conversion(original.dtype):
879882
# datetime, timedelta, or period
@@ -911,7 +914,9 @@ def duplicated(
911914
return htable.duplicated(values, keep=keep)
912915

913916

914-
def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike:
917+
def mode(
918+
values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
919+
) -> ArrayLike:
915920
"""
916921
Returns the mode(s) of an array.
917922
@@ -937,7 +942,7 @@ def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike:
937942

938943
values = _ensure_data(values)
939944

940-
npresult = htable.mode(values, dropna=dropna)
945+
npresult = htable.mode(values, dropna=dropna, mask=mask)
941946
try:
942947
npresult = np.sort(npresult)
943948
except TypeError as err:

pandas/core/arrays/categorical.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from pandas._libs import (
2727
NaT,
2828
algos as libalgos,
29-
hashtable as htable,
3029
lib,
3130
)
3231
from pandas._libs.arrays import NDArrayBacked
@@ -2255,14 +2254,15 @@ def mode(self, dropna: bool = True) -> Categorical:
22552254

22562255
def _mode(self, dropna: bool = True) -> Categorical:
22572256
codes = self._codes
2257+
mask = None
22582258
if dropna:
2259-
good = self._codes != -1
2260-
codes = self._codes[good]
2259+
mask = self.isna()
22612260

2262-
codes = htable.mode(codes, dropna)
2263-
codes.sort()
2264-
codes = coerce_indexer_dtype(codes, self.dtype.categories)
2265-
return self._from_backing_data(codes)
2261+
res_codes = algorithms.mode(codes, mask=mask)
2262+
res_codes = cast(np.ndarray, res_codes)
2263+
assert res_codes.dtype == codes.dtype
2264+
res = self._from_backing_data(res_codes)
2265+
return res
22662266

22672267
# ------------------------------------------------------------------
22682268
# ExtensionArray Interface

pandas/core/arrays/masked.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
)
5858

5959
from pandas.core import (
60+
algorithms as algos,
6061
arraylike,
6162
missing,
6263
nanops,
@@ -907,6 +908,15 @@ def value_counts(self, dropna: bool = True) -> Series:
907908
)
908909
from pandas.arrays import IntegerArray
909910

911+
if dropna:
912+
keys, counts = algos.value_counts_arraylike(
913+
self._data, dropna=True, mask=self._mask
914+
)
915+
res = Series(counts, index=keys)
916+
res.index = res.index.astype(self.dtype)
917+
res = res.astype("Int64")
918+
return res
919+
910920
# compute counts on the data with no nans
911921
data = self._data[~self._mask]
912922
value_counts = Index(data).value_counts()

0 commit comments

Comments
 (0)