update Float64Vector

jreback · jreback · commit 109322abd9f3 · 2015-05-28T13:57:53.000-04:00
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -35,23 +35,6 @@ cdef extern from "Python.h":
 
 cdef size_t _INIT_VEC_CAP = 32
 
-def list_to_object_array(list obj):
-    '''
-    Convert list to object ndarray. Seriously can't believe I had to write this
-    function
-    '''
-    cdef:
-        Py_ssize_t i, n
-        ndarray[object] arr
-
-    n = len(obj)
-    arr = np.empty(n, dtype=object)
-
-    for i from 0 <= i < n:
-        arr[i] = obj[i]
-
-    return arr
-
 cdef class Vector:
     pass
 
@@ -68,6 +51,9 @@ cdef class ObjectVector(Vector):
         self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
         self.data = <PyObject**> self.ao.data
 
+    def __len__(self):  # expose the fill counter self.n (the value append compares against capacity self.m)
+        return self.n
+
     cdef inline append(self, object o):
         if self.n == self.m:
             self.m = max(self.m * 2, _INIT_VEC_CAP)
@@ -132,37 +118,53 @@ cdef class Int64Vector:
 
         Int64VectorData_append(self.data, x)
 
+ctypedef struct Float64VectorData:  # plain C state of a Float64Vector, read/written by the nogil helpers below
+    float64_t *data  # element buffer; Float64Vector points this at its ndarray's memory
+    size_t n, m  # n: elements written so far; m: capacity of the buffer, in elements
+
+cdef uint8_t Float64VectorData_needs_resize(Float64VectorData *data) nogil:  # nonzero when the buffer is full
+    return data.n == data.m  # full means the write index has reached the capacity
+
+cdef void Float64VectorData_append(Float64VectorData *data, float64_t x) nogil:  # store x at index n, then bump n
+
+    data.data[data.n] = x  # no capacity check here: callers test Float64VectorData_needs_resize() first
+    data.n += 1
+
 cdef class Float64Vector(Vector):
 
     cdef:
-        float64_t *data
-        size_t n, m
+        Float64VectorData *data  # C struct (buffer pointer, length n, capacity m); malloc'd in __cinit__, freed in __dealloc__
         ndarray ao
 
     def __cinit__(self):
-        self.n = 0
-        self.m = _INIT_VEC_CAP
-        self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64)
-        self.data = <float64_t*> self.ao.data
+        self.data = <Float64VectorData *>PyMem_Malloc(sizeof(Float64VectorData))
+        self.data.n = 0  # NOTE(review): the PyMem_Malloc result above is not NULL-checked before this write
+        self.data.m = _INIT_VEC_CAP  # start with the module-wide initial capacity
+        self.ao = np.empty(self.data.m, dtype=np.float64)  # the ndarray owns the element storage
+        self.data.data = <float64_t*> self.ao.data  # the struct only caches a pointer into ao
 
     cdef resize(self):
-        self.m = max(self.m * 2, _INIT_VEC_CAP)
-        self.ao.resize(self.m)
-        self.data = <float64_t*> self.ao.data
+        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)  # grow capacity 4x (the replaced code grew 2x), never below _INIT_VEC_CAP
+        self.ao.resize(self.data.m)  # resize the backing ndarray to the new capacity
+        self.data.data = <float64_t*> self.ao.data  # re-read the buffer pointer after resizing ao
 
-    cdef inline void append(self, float64_t x) nogil:
-        if self.n == self.m:
-            with gil:
-                self.resize()
+    def __dealloc__(self):  # release the struct allocated in __cinit__
+        PyMem_Free(self.data)  # frees only the struct; the element buffer belongs to self.ao
 
-        self.data[self.n] = x
-        self.n += 1
+    def __len__(self):  # number of elements written so far
+        return self.data.n
 
     def to_array(self):
-        self.ao.resize(self.n)
-        self.m = self.n
+        self.ao.resize(self.data.n)  # trim the ndarray to the n elements actually written
+        self.data.m = self.data.n  # capacity now equals length, so needs_resize() is true for the next append
         return self.ao
 
+    cdef inline void append(self, float64_t x):  # no longer declared nogil; nogil loops call the Float64VectorData_* helpers directly
+
+        if Float64VectorData_needs_resize(self.data):  # buffer full (n == m): grow before writing
+            self.resize()
+
+        Float64VectorData_append(self.data, x)  # store x and advance n
 
 cdef class HashTable:
     pass
@@ -459,13 +461,21 @@ cdef class Int64HashTable(HashTable):
             int64_t val
             khiter_t k
             Int64Vector uniques = Int64Vector()
+            Int64VectorData *ud
 
-        for i in range(n):
-            val = values[i]
-            k = kh_get_int64(self.table, val)
-            if k == self.table.n_buckets:
-                kh_put_int64(self.table, val, &ret)
-                uniques.append(val)
+        ud = uniques.data
+
+        with nogil:
+            for i in range(n):
+                val = values[i]
+                k = kh_get_int64(self.table, val)
+                if k == self.table.n_buckets:
+                    kh_put_int64(self.table, val, &ret)
+
+                    if Int64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Int64VectorData_append(ud, val)
 
         result = uniques.to_array()
 
@@ -526,26 +536,33 @@ cdef class Float64HashTable(HashTable):
             int ret = 0
             float64_t val
             khiter_t k
+            Float64VectorData *ud
 
         labels = np.empty(n, dtype=np.int64)
+        ud = uniques.data
 
-        for i in range(n):
-            val = values[i]
+        with nogil:
+            for i in range(n):
+                val = values[i]
 
-            if val != val:
-                labels[i] = na_sentinel
-                continue
+                if val != val:
+                    labels[i] = na_sentinel
+                    continue
 
-            k = kh_get_float64(self.table, val)
-            if k != self.table.n_buckets:
-                idx = self.table.vals[k]
-                labels[i] = idx
-            else:
-                k = kh_put_float64(self.table, val, &ret)
-                self.table.vals[k] = count
-                uniques.append(val)
-                labels[i] = count
-                count += 1
+                k = kh_get_float64(self.table, val)
+                if k != self.table.n_buckets:
+                    idx = self.table.vals[k]
+                    labels[i] = idx
+                else:
+                    k = kh_put_float64(self.table, val, &ret)
+                    self.table.vals[k] = count
+
+                    if Float64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Float64VectorData_append(ud, val)
+                    labels[i] = count
+                    count += 1
 
         return labels
 
@@ -588,20 +605,33 @@ cdef class Float64HashTable(HashTable):
             int ret = 0
             float64_t val
             khiter_t k
-            Float64Vector uniques = Float64Vector()
             bint seen_na = 0
+            Float64Vector uniques = Float64Vector()
+            Float64VectorData *ud
 
-        for i in range(n):
-            val = values[i]
+        ud = uniques.data
 
-            if val == val:
-                k = kh_get_float64(self.table, val)
-                if k == self.table.n_buckets:
-                    kh_put_float64(self.table, val, &ret)
-                    uniques.append(val)
-            elif not seen_na:
-                seen_na = 1
-                uniques.append(NAN)
+        with nogil:
+            for i in range(n):
+                val = values[i]
+
+                if val == val:
+                    k = kh_get_float64(self.table, val)
+                    if k == self.table.n_buckets:
+                        kh_put_float64(self.table, val, &ret)
+
+                        if Float64VectorData_needs_resize(ud):
+                            with gil:
+                                uniques.resize()
+                        Float64VectorData_append(ud, val)
+
+                elif not seen_na:
+                    seen_na = 1
+
+                    if Float64VectorData_needs_resize(ud):
+                        with gil:
+                            uniques.resize()
+                    Float64VectorData_append(ud, NAN)
 
         return uniques.to_array()