diff --git a/RELEASE.rst b/RELEASE.rst
index f3d9c72db8bc5..f80a688c3657e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -44,6 +44,7 @@ pandas 0.11.1
     - will warn with a FrequencyWarning if you are attempting to append an
      index with a different frequency than the existing
    - support datelike columns with a timezone as data_columns (GH2852_)
+    - table writing performance improvements.
 
 **API Changes**
 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 1661080b11799..834a94a139ee5 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -913,7 +913,7 @@ def __init__(self, func, nrows, start=None, stop=None, chunksize=None):
         self.stop = min(self.nrows,stop)
 
         if chunksize is None:
-            chunksize = 50000
+            chunksize = 100000
 
         self.chunksize = chunksize
 
@@ -2232,6 +2232,10 @@ def table(self):
         """ return the table group (this is my storable) """
         return self.storable
 
+    @property
+    def dtype(self):
+        return self.table.dtype
+
     @property
     def description(self):
         return self.table.description
@@ -2848,7 +2852,7 @@ class AppendableTable(LegacyTable):
     table_type = 'appendable'
 
     def write(self, obj, axes=None, append=False, complib=None,
-              complevel=None, fletcher32=None, min_itemsize=None, chunksize=50000,
+              complevel=None, fletcher32=None, min_itemsize=None, chunksize=None,
               expectedrows=None, **kwargs):
 
         if not append and self.is_exists:
@@ -2905,18 +2909,26 @@ def write_data(self, chunksize):
             [a.is_searchable for a in self.values_axes]).astype('u1')
         values = [a.take_data() for a in self.values_axes]
 
+        # transpose the values so first dimension is last
+        values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ]
+
         # write the chunks
+        if chunksize is None:
+            chunksize = 100000
+
         rows = self.nrows_expected
         chunks = int(rows / chunksize) + 1
         for i in xrange(chunks):
             start_i = i * chunksize
             end_i = min((i + 1) * chunksize, rows)
+            if start_i >= end_i:
+                break
 
             self.write_data_chunk(
                 indexes=[a[start_i:end_i] for a in indexes],
                 mask=mask[start_i:end_i],
                 search=search,
-                values=[v[:, start_i:end_i] for v in values])
+                values=[v[start_i:end_i] for v in values])
 
     def write_data_chunk(self, indexes, mask, search, values):
 
@@ -2929,7 +2941,7 @@ def write_data_chunk(self, indexes, mask, search, values):
         try:
             func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
             args = list(indexes)
-            args.extend([mask, search, values])
+            args.extend([self.dtype, mask, search, values])
             rows = func(*args)
         except (Exception), detail:
             raise Exception("cannot create row-data -> %s" % str(detail))
@@ -2939,9 +2951,8 @@ def write_data_chunk(self, indexes, mask, search, values):
             self.table.append(rows)
             self.table.flush()
         except (Exception), detail:
-            raise Exception(
-                "tables cannot write this data -> %s" % str(detail))
-
+            raise Exception("tables cannot write this data -> %s" % str(detail))
+
     def delete(self, where=None, **kwargs):
 
         # delete all rows (and return the nrows)
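
A note on the write_data hunk above: rolling the first axis of each value
block to the last position means a chunk of rows is a plain leading-axis
slice (v[start_i:end_i] instead of v[:, start_i:end_i]), and each row in the
Cython row builders below becomes a single index. A minimal sketch of that
scheme in plain NumPy, assuming a toy block shape rather than pandas'
actual block layout:

    import numpy as np

    v = np.arange(24).reshape(2, 3, 4)   # toy value block
    # roll the first dimension to the end: axes (0, 1, 2) -> (1, 2, 0)
    vt = v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))
    assert vt.shape == (3, 4, 2)

    nrows, chunksize = vt.shape[0], 2
    for i in range(int(nrows / chunksize) + 1):
        start_i = i * chunksize
        end_i = min((i + 1) * chunksize, nrows)
        if start_i >= end_i:        # the guard added above: skip the empty tail chunk
            break
        chunk = vt[start_i:end_i]   # rows of the block, sliced on the leading axis
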
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 05171523764c8..d043691bc061e 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -837,61 +837,70 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def create_hdf_rows_2d(ndarray indexer0,
+def create_hdf_rows_2d(ndarray indexer0,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=1] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
-                       list values):
+                       list values):
     """ return a list of objects ready to be converted to rec-array format """
 
     cdef:
-        int i, b, n_indexer0, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, l, b, n_indexer0, n_blocks, tup_size
+        ndarray result
+        tuple tup
+        object v
 
     n_indexer0 = indexer0.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+1
-    l = []
-    for i from 0 <= i < n_indexer0:
+    result = np.empty(n_indexer0,dtype=dtype)
+    l = 0
+    for i in range(n_indexer0):
 
         if not mask[i]:
-
+
             tup = PyTuple_New(tup_size)
-            val = indexer0[i]
-            PyTuple_SET_ITEM(tup, 0, val)
-            Py_INCREF(val)
-            for b from 0 <= b < n_blocks:
+            v = indexer0[i]
+            PyTuple_SET_ITEM(tup, 0, v)
+            Py_INCREF(v)
+
+            for b in range(n_blocks):
 
-                v = values[b][:, i]
+                v = values[b][i]
                 if searchable[b]:
                     v = v[0]
+
                 PyTuple_SET_ITEM(tup, b+1, v)
                 Py_INCREF(v)
 
-            l.append(tup)
+            result[l] = tup
+            l += 1
 
-    return l
+    return result[0:l]
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=2] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """
 
     cdef:
-        int i, j, b, n_indexer0, n_indexer1, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result
 
     n_indexer0 = indexer0.shape[0]
     n_indexer1 = indexer1.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+2
-    l = []
+    result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
+    l = 0
 
     for i from 0 <= i < n_indexer0:
 
         for j from 0 <= j < n_indexer1:
@@ -900,45 +909,49 @@ def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
 
             tup = PyTuple_New(tup_size)
 
-            val = indexer0[i]
-            PyTuple_SET_ITEM(tup, 0, val)
-            Py_INCREF(val)
-
-            val = indexer1[j]
-            PyTuple_SET_ITEM(tup, 1, val)
-            Py_INCREF(val)
+            v = indexer0[i]
+            PyTuple_SET_ITEM(tup, 0, v)
+            Py_INCREF(v)
+            v = indexer1[j]
+            PyTuple_SET_ITEM(tup, 1, v)
+            Py_INCREF(v)
 
             for b from 0 <= b < n_blocks:
 
-                v = values[b][:, i, j]
+                v = values[b][i, j]
                 if searchable[b]:
                     v = v[0]
+
                 PyTuple_SET_ITEM(tup, b+2, v)
                 Py_INCREF(v)
 
-            l.append(tup)
+            result[l] = tup
+            l += 1
 
-    return l
+    return result[0:l]
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
+                       object dtype,
                        ndarray[np.uint8_t, ndim=3] mask,
                        ndarray[np.uint8_t, ndim=1] searchable,
                        list values):
     """ return a list of objects ready to be converted to rec-array format """
 
     cdef:
-        int i, j, k, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
-        list l
-        object tup, val, v
+        int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
+        tuple tup
+        object v
+        ndarray result
 
     n_indexer0 = indexer0.shape[0]
     n_indexer1 = indexer1.shape[0]
     n_indexer2 = indexer2.shape[0]
     n_blocks = len(values)
     tup_size = n_blocks+3
-    l = []
+    result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
+    l = 0
 
     for i from 0 <= i < n_indexer0:
 
         for j from 0 <= j < n_indexer1:
@@ -949,29 +962,28 @@ def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
 
                 tup = PyTuple_New(tup_size)
 
-                val = indexer0[i]
-                PyTuple_SET_ITEM(tup, 0, val)
-                Py_INCREF(val)
-
-                val = indexer1[j]
-                PyTuple_SET_ITEM(tup, 1, val)
-                Py_INCREF(val)
-
-                val = indexer2[k]
-                PyTuple_SET_ITEM(tup, 2, val)
-                Py_INCREF(val)
+                v = indexer0[i]
+                PyTuple_SET_ITEM(tup, 0, v)
+                Py_INCREF(v)
+                v = indexer1[j]
+                PyTuple_SET_ITEM(tup, 1, v)
+                Py_INCREF(v)
+                v = indexer2[k]
+                PyTuple_SET_ITEM(tup, 2, v)
+                Py_INCREF(v)
 
                 for b from 0 <= b < n_blocks:
 
-                    v = values[b][:, i, j, k]
+                    v = values[b][i, j, k]
                     if searchable[b]:
                         v = v[0]
 
                     PyTuple_SET_ITEM(tup, b+3, v)
                     Py_INCREF(v)
 
-                l.append(tup)
+                result[l] = tup
+                l += 1
 
-    return l
+    return result[0:l]
 
 #-------------------------------------------------------------------------------
 # Groupby-related functions
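
For reference, the rec-array strategy that replaces the old list-of-tuples in
the create_hdf_rows_*d builders, restated as plain Python/NumPy. The compound
dtype, data, and mask here are hypothetical stand-ins for the table's dtype
and the real (transposed) value blocks:

    import numpy as np

    dtype = np.dtype([('index', 'i8'), ('values_block_0', 'f8', (2,))])
    index = np.arange(5)
    block = np.random.randn(5, 2)        # rows on axis 0, as after the transpose
    mask = np.array([False, True, False, False, True])

    result = np.empty(len(index), dtype=dtype)   # preallocated, sized for all rows
    l = 0
    for i in range(len(index)):
        if not mask[i]:
            result[l] = (index[i], block[i])     # one record per unmasked row
            l += 1

    rows = result[0:l]   # compacted prefix: a structured array that PyTables'
                         # Table.append() accepts directly

Preallocating the record array and slicing off the used prefix avoids growing
a Python list of tuples per chunk and hands PyTables data it can append
without a further conversion pass.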