Skip to content

Commit e734755

Browse files
committed
BUG: don't use kvec because numpy may use private heap
1 parent deb3676 commit e734755

File tree

3 files changed: 64 additions (+64) and 124 deletions (-124).

pandas/core/algorithms.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
125125

126126
labels = com._ensure_platform_int(labels)
127127

128-
uniques = uniques.to_array(xfer_data=True)
128+
uniques = uniques.to_array()
129129

130130
if sort and len(uniques) > 0:
131131
sorter = uniques.argsort()

pandas/src/hashtable.pyx

Lines changed: 62 additions & 122 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
from cpython cimport PyObject
1+
from cpython cimport PyObject, Py_INCREF
22

33
from khash cimport *
44
from numpy cimport *
@@ -29,144 +29,96 @@ def list_to_object_array(list obj):
2929
return arr
3030

3131

32-
cdef extern from "kvec.h":
33-
34-
ctypedef struct kv_int64_t:
35-
size_t n, m
36-
int64_t* a
37-
38-
ctypedef struct kv_double:
39-
size_t n, m
40-
double* a
41-
42-
ctypedef struct kv_object_t:
43-
size_t n, m
44-
PyObject** a
45-
46-
inline void kv_object_push(kv_object_t *v, PyObject* x)
47-
inline void kv_object_destroy(kv_object_t *v)
48-
inline void kv_int64_push(kv_int64_t *v, int64_t x)
49-
inline void kv_double_push(kv_double *v, double x)
50-
32+
cdef size_t _INIT_VEC_CAP = 32
5133

5234
cdef class ObjectVector:
5335

5436
cdef:
55-
bint owndata
56-
kv_object_t vec
37+
size_t n, m
38+
ndarray ao
39+
PyObject **data
5740

5841
def __cinit__(self):
59-
self.owndata = 1
42+
self.n = 0
43+
self.m = _INIT_VEC_CAP
44+
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
45+
self.data = <PyObject**> self.ao.data
6046

6147
def __len__(self):
62-
return self.vec.n
63-
64-
def to_array(self, xfer_data=True):
65-
""" Here we use the __array__ method, that is called when numpy
66-
tries to get an array from the object."""
67-
cdef:
68-
npy_intp shape[1]
69-
ndarray result
70-
71-
shape[0] = <npy_intp> self.vec.n
72-
73-
# Create a 1D array, of length 'size'
74-
result = PyArray_SimpleNewFromData(1, shape,
75-
np.NPY_OBJECT, self.vec.a)
76-
77-
# urgh, mingw32 barfs because of this
48+
return self.n
7849

79-
if xfer_data:
80-
self.owndata = 0
81-
util.set_array_owndata(result)
82-
83-
# return result
84-
85-
return result.copy()
50+
def to_array(self):
51+
self.ao.resize(self.n)
52+
return self.ao
8653

8754
cdef inline append(self, object o):
88-
kv_object_push(&self.vec, <PyObject*> o)
55+
if self.n == self.m:
56+
self.m = self.m * 2
57+
self.ao.resize(self.m)
58+
self.data = <PyObject**> self.ao.data
8959

90-
def __dealloc__(self):
91-
if self.owndata:
92-
kv_object_destroy(&self.vec)
60+
Py_INCREF(o)
61+
self.data[self.n] = <PyObject*> o
62+
self.n += 1
9363

9464

9565
cdef class Int64Vector:
9666

9767
cdef:
98-
bint owndata
99-
kv_int64_t vec
68+
size_t n, m
69+
ndarray ao
70+
int64_t *data
10071

10172
def __cinit__(self):
102-
self.owndata = 1
73+
self.n = 0
74+
self.m = _INIT_VEC_CAP
75+
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.int64)
76+
self.data = <int64_t*> self.ao.data
10377

10478
def __len__(self):
105-
return self.vec.n
106-
107-
def to_array(self, xfer_data=True):
108-
""" Here we use the __array__ method, that is called when numpy
109-
tries to get an array from the object."""
110-
cdef:
111-
npy_intp shape[1]
112-
ndarray result
113-
114-
shape[0] = <npy_intp> self.vec.n
115-
116-
# Create a 1D array, of length 'size'
117-
result = PyArray_SimpleNewFromData(1, shape, np.NPY_INT64,
118-
self.vec.a)
119-
120-
if xfer_data:
121-
self.owndata = 0
122-
util.set_array_owndata(result)
79+
return self.n
12380

124-
return result
81+
def to_array(self):
82+
self.ao.resize(self.n)
83+
return self.ao
12584

12685
cdef inline append(self, int64_t x):
127-
kv_int64_push(&self.vec, x)
86+
if self.n == self.m:
87+
self.m = self.m * 2
88+
self.ao.resize(self.m)
89+
self.data = <int64_t*> self.ao.data
12890

129-
def __dealloc__(self):
130-
if self.owndata:
131-
free(self.vec.a)
91+
self.data[self.n] = x
92+
self.n += 1
13293

13394
cdef class Float64Vector:
13495

13596
cdef:
136-
bint owndata
137-
kv_double vec
97+
size_t n, m
98+
ndarray ao
99+
float64_t *data
138100

139101
def __cinit__(self):
140-
self.owndata = 1
102+
self.n = 0
103+
self.m = _INIT_VEC_CAP
104+
self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
105+
self.data = <float64_t*> self.ao.data
141106

142107
def __len__(self):
143-
return self.vec.n
144-
145-
def to_array(self, xfer_data=True):
146-
""" Here we use the __array__ method, that is called when numpy
147-
tries to get an array from the object."""
148-
cdef:
149-
npy_intp shape[1]
150-
ndarray result
151-
152-
shape[0] = <npy_intp> self.vec.n
153-
154-
# Create a 1D array, of length 'size'
155-
result = PyArray_SimpleNewFromData(1, shape, np.NPY_FLOAT64,
156-
self.vec.a)
108+
return self.n
157109

158-
if xfer_data:
159-
self.owndata = 0
160-
util.set_array_owndata(result)
161-
162-
return result
110+
def to_array(self):
111+
self.ao.resize(self.n)
112+
return self.ao
163113

164114
cdef inline append(self, float64_t x):
165-
kv_double_push(&self.vec, x)
115+
if self.n == self.m:
116+
self.m = self.m * 2
117+
self.ao.resize(self.m)
118+
self.data = <float64_t*> self.ao.data
166119

167-
def __dealloc__(self):
168-
if self.owndata:
169-
free(self.vec.a)
120+
self.data[self.n] = x
121+
self.n += 1
170122

171123

172124
cdef class HashTable:
@@ -262,7 +214,7 @@ cdef class StringHashTable(HashTable):
262214
uniques.append(val)
263215

264216
# return None
265-
return uniques.to_array(xfer_data=True)
217+
return uniques.to_array()
266218

267219
def factorize(self, ndarray[object] values):
268220
cdef:
@@ -573,7 +525,7 @@ cdef class Int64HashTable(HashTable):
573525
labels[i] = count
574526
count += 1
575527

576-
arr_uniques = uniques.to_array(xfer_data=True)
528+
arr_uniques = uniques.to_array()
577529

578530
return labels, arr_uniques
579531

@@ -587,8 +539,6 @@ cdef class Int64HashTable(HashTable):
587539
khiter_t k
588540
Int64Vector uniques = Int64Vector()
589541

590-
# TODO: kvec
591-
592542
for i in range(n):
593543
val = values[i]
594544
k = kh_get_int64(self.table, val)
@@ -597,11 +547,7 @@ cdef class Int64HashTable(HashTable):
597547
uniques.append(val)
598548
count += 1
599549

600-
result = uniques.to_array(xfer_data=True)
601-
602-
# result = np.array(uniques, copy=False)
603-
# result.base = <PyObject*> uniques
604-
# Py_INCREF(uniques)
550+
result = uniques.to_array()
605551

606552
return result
607553

@@ -625,7 +571,7 @@ cdef class Float64HashTable(HashTable):
625571
def factorize(self, ndarray[float64_t] values):
626572
uniques = Float64Vector()
627573
labels = self.get_labels(values, uniques, 0, -1)
628-
return uniques.to_array(xfer_data=True), labels
574+
return uniques.to_array(), labels
629575

630576
cpdef get_labels(self, ndarray[float64_t] values,
631577
Float64Vector uniques,
@@ -698,8 +644,6 @@ cdef class Float64HashTable(HashTable):
698644
Float64Vector uniques = Float64Vector()
699645
bint seen_na = 0
700646

701-
# TODO: kvec
702-
703647
for i in range(n):
704648
val = values[i]
705649

@@ -713,7 +657,7 @@ cdef class Float64HashTable(HashTable):
713657
seen_na = 1
714658
uniques.append(ONAN)
715659

716-
return uniques.to_array(xfer_data=True)
660+
return uniques.to_array()
717661

718662
cdef class PyObjectHashTable(HashTable):
719663
cdef kh_pymap_t *table
@@ -839,11 +783,7 @@ cdef class PyObjectHashTable(HashTable):
839783
seen_na = 1
840784
uniques.append(ONAN)
841785

842-
result = uniques.to_array(xfer_data=True)
843-
844-
# result = np.array(uniques, copy=False)
845-
# result.base = <PyObject*> uniques
846-
# Py_INCREF(uniques)
786+
result = uniques.to_array()
847787

848788
return result
849789

@@ -903,7 +843,7 @@ cdef class Factorizer:
903843
if labels.dtype != np.int_:
904844
labels = labels.astype(np.int_)
905845

906-
sorter = self.uniques.to_array(xfer_data=False).argsort()
846+
sorter = self.uniques.to_array().argsort()
907847
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
908848
reverse_indexer.put(sorter, np.arange(len(sorter)))
909849

@@ -940,7 +880,7 @@ cdef class Int64Factorizer:
940880
if labels.dtype != np.int_:
941881
labels = labels.astype(np.int_)
942882

943-
sorter = self.uniques.to_array(xfer_data=False).argsort()
883+
sorter = self.uniques.to_array().argsort()
944884
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
945885
reverse_indexer.put(sorter, np.arange(len(sorter)))
946886

pandas/tools/merge.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -572,7 +572,7 @@ def _factorize_keys(lk, rk, sort=True):
572572
count = rizer.get_count()
573573

574574
if sort:
575-
uniques = rizer.uniques.to_array(xfer_data=False)
575+
uniques = rizer.uniques.to_array()
576576
llab, rlab = _sort_labels(uniques, llab, rlab)
577577

578578
# NA group

0 commit comments

Comments
 (0)