Skip to content

Commit 109322a

Browse files
committed
update Float64Vector
1 parent 38c9920 commit 109322a

File tree

1 file changed

+96
-66
lines changed

1 file changed

+96
-66
lines changed

pandas/hashtable.pyx

Lines changed: 96 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -35,23 +35,6 @@ cdef extern from "Python.h":
3535

3636
cdef size_t _INIT_VEC_CAP = 32
3737

38-
def list_to_object_array(list obj):
39-
'''
40-
Convert list to object ndarray. Seriously can't believe I had to write this
41-
function
42-
'''
43-
cdef:
44-
Py_ssize_t i, n
45-
ndarray[object] arr
46-
47-
n = len(obj)
48-
arr = np.empty(n, dtype=object)
49-
50-
for i from 0 <= i < n:
51-
arr[i] = obj[i]
52-
53-
return arr
54-
5538
cdef class Vector:
5639
pass
5740

@@ -68,6 +51,9 @@ cdef class ObjectVector(Vector):
6851
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
6952
self.data = <PyObject**> self.ao.data
7053

54+
def __len__(self):
55+
return self.n
56+
7157
cdef inline append(self, object o):
7258
if self.n == self.m:
7359
self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -132,37 +118,53 @@ cdef class Int64Vector:
132118

133119
Int64VectorData_append(self.data, x)
134120

121+
ctypedef struct Float64VectorData:
122+
float64_t *data
123+
size_t n, m
124+
125+
cdef uint8_t Float64VectorData_needs_resize(Float64VectorData *data) nogil:
126+
return data.n == data.m
127+
128+
cdef void Float64VectorData_append(Float64VectorData *data, float64_t x) nogil:
129+
130+
data.data[data.n] = x
131+
data.n += 1
132+
135133
cdef class Float64Vector(Vector):
136134

137135
cdef:
138-
float64_t *data
139-
size_t n, m
136+
Float64VectorData *data
140137
ndarray ao
141138

142139
def __cinit__(self):
143-
self.n = 0
144-
self.m = _INIT_VEC_CAP
145-
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
146-
self.data = <float64_t*> self.ao.data
140+
self.data = <Float64VectorData *>PyMem_Malloc(sizeof(Float64VectorData))
141+
self.data.n = 0
142+
self.data.m = _INIT_VEC_CAP
143+
self.ao = np.empty(self.data.m, dtype=np.float64)
144+
self.data.data = <float64_t*> self.ao.data
147145

148146
cdef resize(self):
149-
self.m = max(self.m * 2, _INIT_VEC_CAP)
150-
self.ao.resize(self.m)
151-
self.data = <float64_t*> self.ao.data
147+
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
148+
self.ao.resize(self.data.m)
149+
self.data.data = <float64_t*> self.ao.data
152150

153-
cdef inline void append(self, float64_t x) nogil:
154-
if self.n == self.m:
155-
with gil:
156-
self.resize()
151+
def __dealloc__(self):
152+
PyMem_Free(self.data)
157153

158-
self.data[self.n] = x
159-
self.n += 1
154+
def __len__(self):
155+
return self.data.n
160156

161157
def to_array(self):
162-
self.ao.resize(self.n)
163-
self.m = self.n
158+
self.ao.resize(self.data.n)
159+
self.data.m = self.data.n
164160
return self.ao
165161

162+
cdef inline void append(self, float64_t x):
163+
164+
if Float64VectorData_needs_resize(self.data):
165+
self.resize()
166+
167+
Float64VectorData_append(self.data, x)
166168

167169
cdef class HashTable:
168170
pass
@@ -459,13 +461,21 @@ cdef class Int64HashTable(HashTable):
459461
int64_t val
460462
khiter_t k
461463
Int64Vector uniques = Int64Vector()
464+
Int64VectorData *ud
462465

463-
for i in range(n):
464-
val = values[i]
465-
k = kh_get_int64(self.table, val)
466-
if k == self.table.n_buckets:
467-
kh_put_int64(self.table, val, &ret)
468-
uniques.append(val)
466+
ud = uniques.data
467+
468+
with nogil:
469+
for i in range(n):
470+
val = values[i]
471+
k = kh_get_int64(self.table, val)
472+
if k == self.table.n_buckets:
473+
kh_put_int64(self.table, val, &ret)
474+
475+
if Int64VectorData_needs_resize(ud):
476+
with gil:
477+
uniques.resize()
478+
Int64VectorData_append(ud, val)
469479

470480
result = uniques.to_array()
471481

@@ -526,26 +536,33 @@ cdef class Float64HashTable(HashTable):
526536
int ret = 0
527537
float64_t val
528538
khiter_t k
539+
Float64VectorData *ud
529540

530541
labels = np.empty(n, dtype=np.int64)
542+
ud = uniques.data
531543

532-
for i in range(n):
533-
val = values[i]
544+
with nogil:
545+
for i in range(n):
546+
val = values[i]
534547

535-
if val != val:
536-
labels[i] = na_sentinel
537-
continue
548+
if val != val:
549+
labels[i] = na_sentinel
550+
continue
538551

539-
k = kh_get_float64(self.table, val)
540-
if k != self.table.n_buckets:
541-
idx = self.table.vals[k]
542-
labels[i] = idx
543-
else:
544-
k = kh_put_float64(self.table, val, &ret)
545-
self.table.vals[k] = count
546-
uniques.append(val)
547-
labels[i] = count
548-
count += 1
552+
k = kh_get_float64(self.table, val)
553+
if k != self.table.n_buckets:
554+
idx = self.table.vals[k]
555+
labels[i] = idx
556+
else:
557+
k = kh_put_float64(self.table, val, &ret)
558+
self.table.vals[k] = count
559+
560+
if Float64VectorData_needs_resize(ud):
561+
with gil:
562+
uniques.resize()
563+
Float64VectorData_append(ud, val)
564+
labels[i] = count
565+
count += 1
549566

550567
return labels
551568

@@ -588,20 +605,33 @@ cdef class Float64HashTable(HashTable):
588605
int ret = 0
589606
float64_t val
590607
khiter_t k
591-
Float64Vector uniques = Float64Vector()
592608
bint seen_na = 0
609+
Float64Vector uniques = Float64Vector()
610+
Float64VectorData *ud
593611

594-
for i in range(n):
595-
val = values[i]
612+
ud = uniques.data
596613

597-
if val == val:
598-
k = kh_get_float64(self.table, val)
599-
if k == self.table.n_buckets:
600-
kh_put_float64(self.table, val, &ret)
601-
uniques.append(val)
602-
elif not seen_na:
603-
seen_na = 1
604-
uniques.append(NAN)
614+
with nogil:
615+
for i in range(n):
616+
val = values[i]
617+
618+
if val == val:
619+
k = kh_get_float64(self.table, val)
620+
if k == self.table.n_buckets:
621+
kh_put_float64(self.table, val, &ret)
622+
623+
if Float64VectorData_needs_resize(ud):
624+
with gil:
625+
uniques.resize()
626+
Float64VectorData_append(ud, val)
627+
628+
elif not seen_na:
629+
seen_na = 1
630+
631+
if Float64VectorData_needs_resize(ud):
632+
with gil:
633+
uniques.resize()
634+
Float64VectorData_append(ud, NAN)
605635

606636
return uniques.to_array()
607637

0 commit comments

Comments
 (0)