Skip to content

Commit 40b451d

Browse files
committed
use value_counts for mode
1 parent 309cf3a commit 40b451d

File tree

2 files changed

+51
-104
lines changed

2 files changed

+51
-104
lines changed

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 32 additions & 104 deletions
Original file line number | Diff line number | Diff line change
@@ -28,52 +28,6 @@ dtypes = [('Complex128', 'complex128', 'complex128',
2828
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
2929

3030

31-
@cython.wraparound(False)
32-
@cython.boundscheck(False)
33-
{{if dtype == 'object'}}
34-
cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values,
35-
kh_{{ttype}}_t *table, bint dropna):
36-
{{else}}
37-
cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values,
38-
kh_{{ttype}}_t *table, bint dropna):
39-
{{endif}}
40-
cdef:
41-
khiter_t k
42-
Py_ssize_t i, n = len(values)
43-
44-
{{c_type}} val
45-
46-
int ret = 0
47-
48-
{{if dtype == 'object'}}
49-
kh_resize_{{ttype}}(table, n // 10)
50-
51-
for i in range(n):
52-
val = values[i]
53-
if not checknull(val) or not dropna:
54-
k = kh_get_{{ttype}}(table, <PyObject*>val)
55-
if k != table.n_buckets:
56-
table.vals[k] += 1
57-
else:
58-
k = kh_put_{{ttype}}(table, <PyObject*>val, &ret)
59-
table.vals[k] = 1
60-
{{else}}
61-
with nogil:
62-
kh_resize_{{ttype}}(table, n)
63-
64-
for i in range(n):
65-
val = {{to_c_type}}(values[i])
66-
67-
if not is_nan_{{c_type}}(val) or not dropna:
68-
k = kh_get_{{ttype}}(table, val)
69-
if k != table.n_buckets:
70-
table.vals[k] += 1
71-
else:
72-
k = kh_put_{{ttype}}(table, val, &ret)
73-
table.vals[k] = 1
74-
{{endif}}
75-
76-
7731
@cython.wraparound(False)
7832
@cython.boundscheck(False)
7933
{{if dtype == 'object'}}
@@ -294,78 +248,42 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
294248
kh_destroy_{{ttype}}(table)
295249
return result.view(np.bool_)
296250

297-
{{endfor}}
298-
299-
300251
# ----------------------------------------------------------------------
301252
# Mode Computations
302253
# ----------------------------------------------------------------------
303254

304-
{{py:
305-
306-
# dtype, ctype, table_type, npy_dtype
307-
dtypes = [('complex128', 'khcomplex128_t', 'complex128', 'complex128'),
308-
('complex64', 'khcomplex64_t', 'complex64', 'complex64'),
309-
('float64', 'float64_t', 'float64', 'float64'),
310-
('float32', 'float32_t', 'float32', 'float32'),
311-
('int64', 'int64_t', 'int64', 'int64'),
312-
('int32', 'int32_t', 'int32', 'int32'),
313-
('int16', 'int16_t', 'int16', 'int16'),
314-
('int8', 'int8_t', 'int8', 'int8'),
315-
('uint64', 'uint64_t', 'uint64', 'uint64'),
316-
('uint32', 'uint32_t', 'uint32', 'uint32'),
317-
('uint16', 'uint16_t', 'uint16', 'uint16'),
318-
('uint8', 'uint8_t', 'uint8', 'uint8'),
319-
('object', 'object', 'pymap', 'object_')]
320-
}}
321-
322-
{{for dtype, ctype, table_type, npy_dtype in dtypes}}
323-
324255

325256
@cython.wraparound(False)
326257
@cython.boundscheck(False)
327-
328258
{{if dtype == 'object'}}
329-
330-
331-
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
259+
def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
332260
{{else}}
333-
334-
335261
def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
336262
{{endif}}
337263
cdef:
338-
int count, max_count = 1
339-
int j = -1 # so you can do +=
340-
# Don't use Py_ssize_t, since table.n_buckets is unsigned
341-
khiter_t k
342-
kh_{{table_type}}_t *table
343-
ndarray[{{ctype}}] modes
264+
{{if dtype == 'object'}}
265+
ndarray[{{dtype}}] keys
266+
ndarray[{{dtype}}] modes
267+
{{else}}
268+
{{dtype}}_t[:] keys
269+
ndarray[{{dtype}}_t] modes
270+
{{endif}}
271+
int64_t[:] counts
272+
int64_t count, max_count = -1
273+
Py_ssize_t k, j = 0
344274

345-
table = kh_init_{{table_type}}()
346-
build_count_table_{{dtype}}(values, table, dropna)
275+
keys, counts = value_count_{{dtype}}(values, dropna)
347276

348-
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
277+
{{if dtype == 'object'}}
278+
modes = np.empty(len(keys), dtype=np.object_)
279+
{{else}}
280+
modes = np.empty(len(keys), dtype=np.{{dtype}})
281+
{{endif}}
349282

350283
{{if dtype != 'object'}}
351284
with nogil:
352-
for k in range(table.n_buckets):
353-
if kh_exist_{{table_type}}(table, k):
354-
count = table.vals[k]
355-
if count == max_count:
356-
j += 1
357-
elif count > max_count:
358-
max_count = count
359-
j = 0
360-
else:
361-
continue
362-
363-
modes[j] = table.keys[k]
364-
{{else}}
365-
for k in range(table.n_buckets):
366-
if kh_exist_{{table_type}}(table, k):
367-
count = table.vals[k]
368-
285+
for k in range(len(keys)):
286+
count = counts[k]
369287
if count == max_count:
370288
j += 1
371289
elif count > max_count:
@@ -374,11 +292,21 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
374292
else:
375293
continue
376294

377-
modes[j] = <object>table.keys[k]
295+
modes[j] = keys[k]
296+
{{else}}
297+
for k in range(len(keys)):
298+
count = counts[k]
299+
if count == max_count:
300+
j += 1
301+
elif count > max_count:
302+
max_count = count
303+
j = 0
304+
else:
305+
continue
306+
307+
modes[j] = keys[k]
378308
{{endif}}
379309

380-
kh_destroy_{{table_type}}(table)
381-
382310
return modes[:j + 1]
383311

384312
{{endfor}}

pandas/tests/libs/test_hashtable.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,9 @@
55
import pytest
66

77
from pandas._libs import hashtable as ht
8+
from pandas._libs.missing import checknull
89

10+
import pandas as pd
911
import pandas._testing as tm
1012

1113

@@ -323,6 +325,23 @@ def test_mode(self, dtype, type_suffix, writable):
323325
result = mode(values, False)
324326
assert result == 42
325327

328+
def test_mode_stable(self, dtype, type_suffix, writable):
329+
mode = get_ht_function("mode", type_suffix)
330+
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
331+
values.flags.writeable = writable
332+
keys = mode(values, False)
333+
tm.assert_numpy_array_equal(keys, values)
334+
335+
336+
def test_modes_with_nans():
337+
# GH39007
338+
values = np.array([True, pd.NA, np.nan], dtype=np.object_)
339+
# pd.NA and np.nan will have the same representative: np.nan
340+
# thus we have 2 nans and 1 True
341+
modes = ht.mode_object(values, False)
342+
assert modes.size == 1
343+
assert checknull(modes[0])
344+
326345

327346
@pytest.mark.parametrize(
328347
"dtype, type_suffix",

0 commit comments

Comments
 (0)