From 6c2197dfe0bf662bab234dbc8ce84e8427d8b651 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 3 Aug 2013 12:56:16 -0400 Subject: [PATCH] PERF: enhance HDFStore Table writing performance CLN: removed is_searchable (no longer needed) --- doc/source/release.rst | 1 + doc/source/v0.13.0.txt | 1 + pandas/io/pytables.py | 49 +++++++++---- pandas/lib.pyx | 151 ----------------------------------------- 4 files changed, 36 insertions(+), 166 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index ddf0ecfc52d61..473e1792cb0d0 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -46,6 +46,7 @@ pandas 0.13 ``read_table``, ``read_csv``, etc. - Added a more informative error message when plot arguments contain overlapping color and style arguments (:issue:`4402`) + - Significant table writing performance improvements in ``HDFStore`` **API Changes** diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 320b91969846d..0e2432a8b2b10 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -32,6 +32,7 @@ API changes - ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`) - ``HDFStore`` + - Significant table writing performance improvements - added an ``is_open`` property to indicate if the underlying file handle is_open; a closed store will now report 'CLOSED' when viewing the store (rather than raising an error) (:issue:`4409`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4eae54b5dc85e..0f429234ba3dc 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1037,7 +1037,6 @@ class IndexCol(StringMixin): """ is_an_indexable = True is_data_indexable = True - is_searchable = False _info_fields = ['freq','tz','index_name'] def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, @@ -1299,7 +1298,6 @@ class DataCol(IndexCol): """ is_an_indexable = False is_data_indexable = False - is_searchable = False _info_fields = ['tz'] @classmethod @@ -1588,10 +1586,6 @@ class DataIndexableCol(DataCol): """ represent a data column that can be indexed """ is_data_indexable = True - @property - def is_searchable(self): - return _ensure_decoded(self.kind) == u('string') - def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize) @@ -3061,8 +3055,6 @@ def write_data(self, chunksize): # the arguments indexes = [a.cvalues for a in self.index_axes] - search = np.array( - [a.is_searchable for a in self.values_axes]).astype('u1') values = [a.take_data() for a in self.values_axes] # transpose the values so first dimension is last @@ -3083,22 +3075,49 @@ def write_data(self, chunksize): self.write_data_chunk( indexes=[a[start_i:end_i] for a in indexes], mask=mask[start_i:end_i], - search=search, values=[v[start_i:end_i] for v in values]) - def write_data_chunk(self, indexes, mask, search, values): + def write_data_chunk(self, indexes, mask, values): # 0 len for v in values: if not np.prod(v.shape): return - # get our function try: - func = getattr(lib, "create_hdf_rows_%sd" % self.ndim) - args = list(indexes) - args.extend([self.dtype, mask, search, values]) - rows = func(*args) + nrows = np.prod([ idx.shape[0] for idx in indexes ]) + rows = np.empty(nrows,dtype=self.dtype) + names = self.dtype.names + + # indexes + nindexes = len(indexes) + for i, idx in enumerate(indexes): + + # broadcast to all other indexes except myself + if i > 0 and i < nindexes: + repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)]) + idx = np.tile(idx,repeater) + + if i < nindexes-1: + repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)]) + idx = np.repeat(idx,repeater) + + rows[names[i]] = idx + + # values + for i, v in enumerate(values): + name = names[nindexes + i] + b = values[i] + + # reshape + new_shape = (nrows,) + self.dtype[name].shape + b = b.ravel().reshape(new_shape) + + rows[name] = b + + # mask + rows = rows[~mask.ravel().astype(bool)] + except Exception as detail: raise Exception("cannot create row-data -> %s" % str(detail)) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 031f2c56deb13..7c4ba1cda35eb 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -842,157 +842,6 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr if j >= 0 and (j < N-1 or (j % N) != N-1 ): writer.writerows(rows[:((j+1) % N)]) - -@cython.boundscheck(False) -@cython.wraparound(False) -def create_hdf_rows_2d(ndarray indexer0, - object dtype, - ndarray[np.uint8_t, ndim=1] mask, - ndarray[np.uint8_t, ndim=1] searchable, - list values): - """ return a list of objects ready to be converted to rec-array format """ - - cdef: - int i, l, b, n_indexer0, n_blocks, tup_size - ndarray result - tuple tup - object v - - n_indexer0 = indexer0.shape[0] - n_blocks = len(values) - tup_size = n_blocks+1 - - result = np.empty(n_indexer0,dtype=dtype) - l = 0 - for i in range(n_indexer0): - - if not mask[i]: - - tup = PyTuple_New(tup_size) - - v = indexer0[i] - PyTuple_SET_ITEM(tup, 0, v) - Py_INCREF(v) - - for b in range(n_blocks): - - v = values[b][i] - if searchable[b]: - v = v[0] - - PyTuple_SET_ITEM(tup, b+1, v) - Py_INCREF(v) - - result[l] = tup - l += 1 - - return result[0:l] - -@cython.boundscheck(False) -@cython.wraparound(False) -def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1, - object dtype, - ndarray[np.uint8_t, ndim=2] mask, - ndarray[np.uint8_t, ndim=1] searchable, - list values): - """ return a list of objects ready to be converted to rec-array format """ - - cdef: - int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size - tuple tup - object v - ndarray result - - n_indexer0 = indexer0.shape[0] - n_indexer1 = indexer1.shape[0] - n_blocks = len(values) - tup_size = n_blocks+2 - result = np.empty(n_indexer0*n_indexer1,dtype=dtype) - l = 0 - for i from 0 <= i < n_indexer0: - - for j from 0 <= j < n_indexer1: - - if not mask[i, j]: - - tup = PyTuple_New(tup_size) - - v = indexer0[i] - PyTuple_SET_ITEM(tup, 0, v) - Py_INCREF(v) - v = indexer1[j] - PyTuple_SET_ITEM(tup, 1, v) - Py_INCREF(v) - - for b from 0 <= b < n_blocks: - - v = values[b][i, j] - if searchable[b]: - v = v[0] - - PyTuple_SET_ITEM(tup, b+2, v) - Py_INCREF(v) - - result[l] = tup - l += 1 - - return result[0:l] - -@cython.boundscheck(False) -@cython.wraparound(False) -def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2, - object dtype, - ndarray[np.uint8_t, ndim=3] mask, - ndarray[np.uint8_t, ndim=1] searchable, - list values): - """ return a list of objects ready to be converted to rec-array format """ - - cdef: - int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size - tuple tup - object v - ndarray result - - n_indexer0 = indexer0.shape[0] - n_indexer1 = indexer1.shape[0] - n_indexer2 = indexer2.shape[0] - n_blocks = len(values) - tup_size = n_blocks+3 - result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype) - l = 0 - for i from 0 <= i < n_indexer0: - - for j from 0 <= j < n_indexer1: - - for k from 0 <= k < n_indexer2: - - if not mask[i, j, k]: - - tup = PyTuple_New(tup_size) - - v = indexer0[i] - PyTuple_SET_ITEM(tup, 0, v) - Py_INCREF(v) - v = indexer1[j] - PyTuple_SET_ITEM(tup, 1, v) - Py_INCREF(v) - v = indexer2[k] - PyTuple_SET_ITEM(tup, 2, v) - Py_INCREF(v) - - for b from 0 <= b < n_blocks: - - v = values[b][i, j, k] - if searchable[b]: - v = v[0] - PyTuple_SET_ITEM(tup, b+3, v) - Py_INCREF(v) - - result[l] = tup - l += 1 - - return result[0:l] - #------------------------------------------------------------------------------- # Groupby-related functions