Merge pull request #4457 from jreback/hdf_opt

jreback · jreback · commit 8915ce675895 · 2013-08-05T05:06:32.000-07:00
PERF: enhance HDFStore Table writing performance
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -46,6 +46,7 @@ pandas 0.13
     ``read_table``, ``read_csv``, etc.
   - Added a more informative error message when plot arguments contain
     overlapping color and style arguments (:issue:`4402`)
+  - Significant table writing performance improvements in ``HDFStore`` (:issue:`4457`)
 
 **API Changes**
 
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -32,6 +32,7 @@ API changes
   - ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`)
   - ``HDFStore``
 
+    - Significant table writing performance improvements (:issue:`4457`)
     - added an ``is_open`` property to indicate if the underlying file handle is_open;
       a closed store will now report 'CLOSED' when viewing the store (rather than raising an error)
       (:issue:`4409`)
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -1037,7 +1037,6 @@ class IndexCol(StringMixin):
         """
     is_an_indexable = True
     is_data_indexable = True
-    is_searchable = False
     _info_fields = ['freq','tz','index_name']
 
     def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None,
@@ -1299,7 +1298,6 @@ class DataCol(IndexCol):
         """
     is_an_indexable = False
     is_data_indexable = False
-    is_searchable = False
     _info_fields = ['tz']
 
     @classmethod
@@ -1588,10 +1586,6 @@ class DataIndexableCol(DataCol):
     """ represent a data column that can be indexed """
     is_data_indexable = True
 
-    @property
-    def is_searchable(self):
-        return _ensure_decoded(self.kind) == u('string')
-
     def get_atom_string(self, block, itemsize):
         return _tables().StringCol(itemsize=itemsize)
 
@@ -3061,8 +3055,6 @@ def write_data(self, chunksize):
 
         # the arguments
         indexes = [a.cvalues for a in self.index_axes]
-        search = np.array(
-            [a.is_searchable for a in self.values_axes]).astype('u1')
         values = [a.take_data() for a in self.values_axes]
 
         # transpose the values so first dimension is last
@@ -3083,22 +3075,49 @@ def write_data(self, chunksize):
             self.write_data_chunk(
                 indexes=[a[start_i:end_i] for a in indexes],
                 mask=mask[start_i:end_i],
-                search=search,
                 values=[v[start_i:end_i] for v in values])
 
-    def write_data_chunk(self, indexes, mask, search, values):
+    def write_data_chunk(self, indexes, mask, values):
 
         # 0 len
         for v in values:
             if not np.prod(v.shape):
                 return
 
-        # get our function
         try:
-            func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
-            args = list(indexes)
-            args.extend([self.dtype, mask, search, values])
-            rows = func(*args)
+            nrows = np.prod([ idx.shape[0] for idx in indexes ])
+            rows = np.empty(nrows,dtype=self.dtype)
+            names = self.dtype.names
+
+            # indexes
+            nindexes = len(indexes)
+            for i, idx in enumerate(indexes):
+
+                # broadcast against the other indexes: tile over those before this one, repeat over those after
+                if i > 0 and i < nindexes:
+                    repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
+                    idx = np.tile(idx,repeater)
+
+                if i < nindexes-1:
+                    repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
+                    idx = np.repeat(idx,repeater)
+
+                rows[names[i]] = idx
+
+            # values
+            for i, v in enumerate(values):
+                name = names[nindexes + i]
+                b = values[i]
+
+                # reshape
+                new_shape = (nrows,) + self.dtype[name].shape
+                b = b.ravel().reshape(new_shape)
+
+                rows[name] = b
+
+            # mask
+            rows = rows[~mask.ravel().astype(bool)]
+
         except Exception as detail:
             raise Exception("cannot create row-data -> %s" % str(detail))
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -842,157 +842,6 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr
     if  j >= 0 and (j < N-1 or (j % N) != N-1 ):
         writer.writerows(rows[:((j+1) % N)])
 
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def create_hdf_rows_2d(ndarray indexer0,
-                       object dtype,
-                       ndarray[np.uint8_t, ndim=1] mask,
-                       ndarray[np.uint8_t, ndim=1] searchable,
-                       list values):
-    """ return a list of objects ready to be converted to rec-array format """
-
-    cdef:
-        int i, l, b, n_indexer0, n_blocks, tup_size
-        ndarray result
-        tuple tup
-        object v
-
-    n_indexer0 = indexer0.shape[0]
-    n_blocks   = len(values)
-    tup_size   = n_blocks+1
-
-    result = np.empty(n_indexer0,dtype=dtype)
-    l = 0
-    for i in range(n_indexer0):
-
-        if not mask[i]:
-
-            tup = PyTuple_New(tup_size)
-
-            v  = indexer0[i]
-            PyTuple_SET_ITEM(tup, 0, v)
-            Py_INCREF(v)
-
-            for b in range(n_blocks):
-
-                v = values[b][i]
-                if searchable[b]:
-                    v = v[0]
-
-                PyTuple_SET_ITEM(tup, b+1, v)
-                Py_INCREF(v)
-
-            result[l] = tup
-            l += 1
-
-    return result[0:l]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
-                       object dtype,
-                       ndarray[np.uint8_t, ndim=2] mask,
-                       ndarray[np.uint8_t, ndim=1] searchable,
-                       list values):
-    """ return a list of objects ready to be converted to rec-array format """
-
-    cdef:
-        int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
-        tuple tup
-        object v
-        ndarray result
-
-    n_indexer0 = indexer0.shape[0]
-    n_indexer1 = indexer1.shape[0]
-    n_blocks   = len(values)
-    tup_size   = n_blocks+2
-    result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
-    l = 0
-    for i from 0 <= i < n_indexer0:
-
-        for j from 0 <= j < n_indexer1:
-
-            if not mask[i, j]:
-
-                tup = PyTuple_New(tup_size)
-
-                v = indexer0[i]
-                PyTuple_SET_ITEM(tup, 0, v)
-                Py_INCREF(v)
-                v = indexer1[j]
-                PyTuple_SET_ITEM(tup, 1, v)
-                Py_INCREF(v)
-
-                for b from 0 <= b < n_blocks:
-
-                    v   = values[b][i, j]
-                    if searchable[b]:
-                        v = v[0]
-
-                    PyTuple_SET_ITEM(tup, b+2, v)
-                    Py_INCREF(v)
-
-                result[l] = tup
-                l += 1
-
-    return result[0:l]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
-                       object dtype,
-                       ndarray[np.uint8_t, ndim=3] mask,
-                       ndarray[np.uint8_t, ndim=1] searchable,
-                       list values):
-    """ return a list of objects ready to be converted to rec-array format """
-
-    cdef:
-        int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
-        tuple tup
-        object v
-        ndarray result
-
-    n_indexer0 = indexer0.shape[0]
-    n_indexer1 = indexer1.shape[0]
-    n_indexer2 = indexer2.shape[0]
-    n_blocks   = len(values)
-    tup_size   = n_blocks+3
-    result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
-    l = 0
-    for i from 0 <= i < n_indexer0:
-
-        for j from 0 <= j < n_indexer1:
-
-            for k from 0 <= k < n_indexer2:
-
-                if not mask[i, j, k]:
-
-                    tup = PyTuple_New(tup_size)
-
-                    v = indexer0[i]
-                    PyTuple_SET_ITEM(tup, 0, v)
-                    Py_INCREF(v)
-                    v = indexer1[j]
-                    PyTuple_SET_ITEM(tup, 1, v)
-                    Py_INCREF(v)
-                    v = indexer2[k]
-                    PyTuple_SET_ITEM(tup, 2, v)
-                    Py_INCREF(v)
-
-                    for b from 0 <= b < n_blocks:
-
-                        v   = values[b][i, j, k]
-                        if searchable[b]:
-                            v = v[0]
-                        PyTuple_SET_ITEM(tup, b+3, v)
-                        Py_INCREF(v)
-
-                    result[l] = tup
-                    l += 1
-
-    return result[0:l]
-
 #-------------------------------------------------------------------------------
 # Groupby-related functions