Skip to content

Commit b50af20

Browse files
committed
BUG: float64 hash table for handling NAs in Series.unique, close #714
1 parent 271407c commit b50af20

File tree

4 files changed

+90
-6
lines changed

4 files changed

+90
-6
lines changed

pandas/core/series.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -814,11 +814,16 @@ def unique(self):
814814
uniques : ndarray
815815
"""
816816
values = self.values
817-
if not values.dtype == np.object_:
818-
values = values.astype('O')
819-
table = lib.PyObjectHashTable(len(values))
820-
uniques = lib.list_to_object_array(table.unique(values))
821-
return lib.maybe_convert_objects(uniques)
817+
if issubclass(values.dtype.type, np.floating):
818+
table = lib.Float64HashTable(len(values))
819+
uniques = np.array(table.unique(values), dtype='f8')
820+
else:
821+
if not values.dtype == np.object_:
822+
values = values.astype('O')
823+
table = lib.PyObjectHashTable(len(values))
824+
uniques = lib.list_to_object_array(table.unique(values))
825+
uniques = lib.maybe_convert_objects(uniques)
826+
return uniques
822827

823828
def nunique(self):
824829
"""

pandas/src/hashtable.pyx

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,55 @@ cdef class Int64HashTable:
579579

580580
return uniques
581581

582+
ONAN = np.nan
583+
584+
cdef class Float64HashTable:
585+
586+
cdef:
587+
kh_float64_t *table
588+
589+
def __init__(self, size_hint=1):
590+
if size_hint is not None:
591+
kh_resize_float64(self.table, size_hint)
592+
593+
def __cinit__(self):
594+
self.table = kh_init_float64()
595+
596+
def __dealloc__(self):
597+
kh_destroy_float64(self.table)
598+
599+
def factorize(self, ndarray[object] values):
600+
reverse = {}
601+
labels, counts = self.get_labels(values, reverse, 0)
602+
return reverse, labels, counts
603+
604+
def unique(self, ndarray[float64_t] values):
605+
cdef:
606+
Py_ssize_t i, n = len(values)
607+
Py_ssize_t idx, count = 0
608+
int ret
609+
float64_t val
610+
khiter_t k
611+
list uniques = []
612+
bint seen_na = 0
613+
614+
# TODO: kvec
615+
616+
for i in range(n):
617+
val = values[i]
618+
619+
if val == val:
620+
k = kh_get_float64(self.table, val)
621+
if k == self.table.n_buckets:
622+
k = kh_put_float64(self.table, val, &ret)
623+
uniques.append(val)
624+
count += 1
625+
elif not seen_na:
626+
seen_na = 1
627+
uniques.append(ONAN)
628+
629+
return uniques
630+
582631
cdef class PyObjectHashTable:
583632

584633
cdef:

pandas/src/khash.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ int main() {
112112
#include <stdlib.h>
113113
#include <string.h>
114114
#include <limits.h>
115+
#include <Python.h>
115116

116117
/* compiler specific configuration */
117118

@@ -129,6 +130,8 @@ typedef unsigned long long khuint64_t;
129130
typedef signed long long khint64_t;
130131
#endif
131132

133+
typedef double khfloat64_t;
134+
132135
#ifndef PANDAS_INLINE
133136
#if defined(__GNUC__)
134137
#define PANDAS_INLINE __inline__
@@ -346,6 +349,12 @@ static const double __ac_HASH_UPPER = 0.77;
346349
@abstract 64-bit integer comparison function
347350
*/
348351
#define kh_int64_hash_equal(a, b) ((a) == (b))
352+
353+
// kludge: hash doubles with _Py_HashDouble (from Python.h) and reuse the int64 `==` macro for key equality
354+
355+
#define kh_float64_hash_func _Py_HashDouble
356+
#define kh_float64_hash_equal kh_int64_hash_equal
357+
349358
/*! @function
350359
@abstract const char* hash function
351360
@param s Pointer to a null terminated string
@@ -544,6 +553,9 @@ static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key)
544553
#define KHASH_MAP_INIT_INT64(name, khval_t) \
545554
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
546555

556+
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
557+
KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal)
558+
547559
typedef const char *kh_cstr_t;
548560
/*! @function
549561
@abstract Instantiate a hash map containing const char* keys
@@ -584,12 +596,14 @@ KHASH_SET_INIT_PYOBJECT(pyset)
584596
#define kh_exist_pymap(h, k) (kh_exist(h, k))
585597
#define kh_exist_pyset(h, k) (kh_exist(h, k))
586598
#define kh_exist_str(h, k) (kh_exist(h, k))
599+
#define kh_exist_float64(h, k) (kh_exist(h, k))
587600
#define kh_exist_int64(h, k) (kh_exist(h, k))
588601
#define kh_exist_int32(h, k) (kh_exist(h, k))
589602

590603
KHASH_MAP_INIT_STR(str, Py_ssize_t)
591604

592605
KHASH_MAP_INIT_INT(int32, Py_ssize_t)
593606
KHASH_MAP_INIT_INT64(int64, Py_ssize_t)
607+
KHASH_MAP_INIT_FLOAT64(float64, Py_ssize_t)
594608

595609
#endif /* __AC_KHASH_H */

pandas/src/khash.pxd

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from cpython cimport PyObject
2-
from numpy cimport int64_t, int32_t, uint32_t
2+
from numpy cimport int64_t, int32_t, uint32_t, float64_t
33

44
cdef extern from "khash.h":
55
ctypedef uint32_t khint_t
@@ -71,6 +71,22 @@ cdef extern from "khash.h":
7171

7272
bint kh_exist_int64(kh_int64_t*, khiter_t)
7373

74+
ctypedef struct kh_float64_t:
75+
khint_t n_buckets, size, n_occupied, upper_bound
76+
uint32_t *flags
77+
float64_t *keys
78+
Py_ssize_t *vals
79+
80+
inline kh_float64_t* kh_init_float64()
81+
inline void kh_destroy_float64(kh_float64_t*)
82+
inline void kh_clear_float64(kh_float64_t*)
83+
inline khint_t kh_get_float64(kh_float64_t*, float64_t)
84+
inline void kh_resize_float64(kh_float64_t*, khint_t)
85+
inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*)
86+
inline void kh_del_float64(kh_float64_t*, khint_t)
87+
88+
bint kh_exist_float64(kh_float64_t*, khiter_t)
89+
7490
ctypedef struct kh_int32_t:
7591
khint_t n_buckets, size, n_occupied, upper_bound
7692
uint32_t *flags

0 commit comments

Comments
 (0)