Skip to content

PERF: enhance HDFStore Table writing performance #4457

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 5, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ pandas 0.13
``read_table``, ``read_csv``, etc.
- Added a more informative error message when plot arguments contain
overlapping color and style arguments (:issue:`4402`)
- Significant table writing performance improvements in ``HDFStore``

**API Changes**

Expand Down
1 change: 1 addition & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ API changes
- ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`)
- ``HDFStore``

- Significant table writing performance improvements
    - added an ``is_open`` property to indicate if the underlying file handle is open;
a closed store will now report 'CLOSED' when viewing the store (rather than raising an error)
(:issue:`4409`)
Expand Down
49 changes: 34 additions & 15 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,6 @@ class IndexCol(StringMixin):
"""
is_an_indexable = True
is_data_indexable = True
is_searchable = False
_info_fields = ['freq','tz','index_name']

def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None,
Expand Down Expand Up @@ -1299,7 +1298,6 @@ class DataCol(IndexCol):
"""
is_an_indexable = False
is_data_indexable = False
is_searchable = False
_info_fields = ['tz']

@classmethod
Expand Down Expand Up @@ -1588,10 +1586,6 @@ class DataIndexableCol(DataCol):
""" represent a data column that can be indexed """
is_data_indexable = True

@property
def is_searchable(self):
return _ensure_decoded(self.kind) == u('string')

def get_atom_string(self, block, itemsize):
return _tables().StringCol(itemsize=itemsize)

Expand Down Expand Up @@ -3061,8 +3055,6 @@ def write_data(self, chunksize):

# the arguments
indexes = [a.cvalues for a in self.index_axes]
search = np.array(
[a.is_searchable for a in self.values_axes]).astype('u1')
values = [a.take_data() for a in self.values_axes]

# transpose the values so first dimension is last
Expand All @@ -3083,22 +3075,49 @@ def write_data(self, chunksize):
self.write_data_chunk(
indexes=[a[start_i:end_i] for a in indexes],
mask=mask[start_i:end_i],
search=search,
values=[v[start_i:end_i] for v in values])

def write_data_chunk(self, indexes, mask, search, values):
def write_data_chunk(self, indexes, mask, values):

# 0 len
for v in values:
if not np.prod(v.shape):
return

# get our function
try:
func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
args = list(indexes)
args.extend([self.dtype, mask, search, values])
rows = func(*args)
nrows = np.prod([ idx.shape[0] for idx in indexes ])
rows = np.empty(nrows,dtype=self.dtype)
names = self.dtype.names

# indexes
nindexes = len(indexes)
for i, idx in enumerate(indexes):

# broadcast to all other indexes except myself
if i > 0 and i < nindexes:
repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
idx = np.tile(idx,repeater)

if i < nindexes-1:
repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
idx = np.repeat(idx,repeater)

rows[names[i]] = idx

# values
for i, v in enumerate(values):
name = names[nindexes + i]
b = values[i]

# reshape
new_shape = (nrows,) + self.dtype[name].shape
b = b.ravel().reshape(new_shape)

rows[name] = b

# mask
rows = rows[~mask.ravel().astype(bool)]

except Exception as detail:
raise Exception("cannot create row-data -> %s" % str(detail))

Expand Down
151 changes: 0 additions & 151 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -842,157 +842,6 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr
if j >= 0 and (j < N-1 or (j % N) != N-1 ):
writer.writerows(rows[:((j+1) % N)])


@cython.boundscheck(False)
@cython.wraparound(False)
def create_hdf_rows_2d(ndarray indexer0,
                       object dtype,
                       ndarray[np.uint8_t, ndim=1] mask,
                       ndarray[np.uint8_t, ndim=1] searchable,
                       list values):
    """ Build a record ndarray of unmasked rows for a 1-index HDFStore table.

    Parameters
    ----------
    indexer0 : index values for the single index axis
    dtype : numpy record dtype of the result array
    mask : uint8 array, one flag per row; rows with a non-zero flag
        are skipped
    searchable : uint8 flag per value block; when set, the element is
        unwrapped via ``v[0]`` (block slice holds the scalar itself)
    values : list of value blocks, each indexable by row position

    Returns
    -------
    ndarray with record dtype ``dtype``, trimmed to the number of rows
    that passed the mask.
    """

    cdef:
        int i, l, b, n_indexer0, n_blocks, tup_size
        ndarray result
        tuple tup
        object v

    n_indexer0 = indexer0.shape[0]
    n_blocks = len(values)
    # each record is (index_value, block_0, block_1, ...)
    tup_size = n_blocks+1

    # over-allocate for every row; trimmed to the unmasked count below
    result = np.empty(n_indexer0,dtype=dtype)
    l = 0
    for i in range(n_indexer0):

        if not mask[i]:

            # build the record tuple directly via the C-API;
            # PyTuple_SET_ITEM steals a reference, so Py_INCREF
            # compensates for the reference Cython itself still holds in v
            tup = PyTuple_New(tup_size)

            v = indexer0[i]
            PyTuple_SET_ITEM(tup, 0, v)
            Py_INCREF(v)

            for b in range(n_blocks):

                v = values[b][i]
                # searchable (data-indexable) blocks yield a wrapped
                # element here; take the scalar out
                if searchable[b]:
                    v = v[0]

                PyTuple_SET_ITEM(tup, b+1, v)
                Py_INCREF(v)

            result[l] = tup
            l += 1

    # slice off the unused tail (masked rows were never written)
    return result[0:l]

@cython.boundscheck(False)
@cython.wraparound(False)
def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
                       object dtype,
                       ndarray[np.uint8_t, ndim=2] mask,
                       ndarray[np.uint8_t, ndim=1] searchable,
                       list values):
    """ Build a record ndarray of unmasked rows for a 2-index HDFStore table.

    Rows are emitted with ``indexer0`` as the slower (outer) axis and
    ``indexer1`` as the faster (inner) axis.

    Parameters
    ----------
    indexer0, indexer1 : index values for the two index axes
    dtype : numpy record dtype of the result array
    mask : uint8 array of shape (len(indexer0), len(indexer1)); cells
        with a non-zero flag are skipped
    searchable : uint8 flag per value block; when set, the element is
        unwrapped via ``v[0]``
    values : list of value blocks, each indexable by ``[i, j]``

    Returns
    -------
    ndarray with record dtype ``dtype``, trimmed to the number of cells
    that passed the mask.
    """

    cdef:
        int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
        tuple tup
        object v
        ndarray result

    n_indexer0 = indexer0.shape[0]
    n_indexer1 = indexer1.shape[0]
    n_blocks = len(values)
    # each record is (index0, index1, block_0, block_1, ...)
    tup_size = n_blocks+2
    # over-allocate for the full cross-product; trimmed below
    result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
    l = 0
    for i from 0 <= i < n_indexer0:

        for j from 0 <= j < n_indexer1:

            if not mask[i, j]:

                # C-API tuple build; PyTuple_SET_ITEM steals a
                # reference, Py_INCREF balances Cython's own reference
                tup = PyTuple_New(tup_size)

                v = indexer0[i]
                PyTuple_SET_ITEM(tup, 0, v)
                Py_INCREF(v)
                v = indexer1[j]
                PyTuple_SET_ITEM(tup, 1, v)
                Py_INCREF(v)

                for b from 0 <= b < n_blocks:

                    v = values[b][i, j]
                    # unwrap scalar for searchable (data-indexable) blocks
                    if searchable[b]:
                        v = v[0]

                    PyTuple_SET_ITEM(tup, b+2, v)
                    Py_INCREF(v)

                result[l] = tup
                l += 1

    # slice off the unused tail (masked cells were never written)
    return result[0:l]

@cython.boundscheck(False)
@cython.wraparound(False)
def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
                       object dtype,
                       ndarray[np.uint8_t, ndim=3] mask,
                       ndarray[np.uint8_t, ndim=1] searchable,
                       list values):
    """ Build a record ndarray of unmasked rows for a 3-index HDFStore table.

    Rows are emitted with ``indexer0`` as the slowest (outermost) axis
    and ``indexer2`` as the fastest (innermost) axis.

    Parameters
    ----------
    indexer0, indexer1, indexer2 : index values for the three index axes
    dtype : numpy record dtype of the result array
    mask : uint8 array of shape
        (len(indexer0), len(indexer1), len(indexer2)); cells with a
        non-zero flag are skipped
    searchable : uint8 flag per value block; when set, the element is
        unwrapped via ``v[0]``
    values : list of value blocks, each indexable by ``[i, j, k]``

    Returns
    -------
    ndarray with record dtype ``dtype``, trimmed to the number of cells
    that passed the mask.
    """

    cdef:
        int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
        tuple tup
        object v
        ndarray result

    n_indexer0 = indexer0.shape[0]
    n_indexer1 = indexer1.shape[0]
    n_indexer2 = indexer2.shape[0]
    n_blocks = len(values)
    # each record is (index0, index1, index2, block_0, block_1, ...)
    tup_size = n_blocks+3
    # over-allocate for the full cross-product; trimmed below
    result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
    l = 0
    for i from 0 <= i < n_indexer0:

        for j from 0 <= j < n_indexer1:

            for k from 0 <= k < n_indexer2:

                if not mask[i, j, k]:

                    # C-API tuple build; PyTuple_SET_ITEM steals a
                    # reference, Py_INCREF balances Cython's own reference
                    tup = PyTuple_New(tup_size)

                    v = indexer0[i]
                    PyTuple_SET_ITEM(tup, 0, v)
                    Py_INCREF(v)
                    v = indexer1[j]
                    PyTuple_SET_ITEM(tup, 1, v)
                    Py_INCREF(v)
                    v = indexer2[k]
                    PyTuple_SET_ITEM(tup, 2, v)
                    Py_INCREF(v)

                    for b from 0 <= b < n_blocks:

                        v = values[b][i, j, k]
                        # unwrap scalar for searchable (data-indexable) blocks
                        if searchable[b]:
                            v = v[0]
                        PyTuple_SET_ITEM(tup, b+3, v)
                        Py_INCREF(v)

                    result[l] = tup
                    l += 1

    # slice off the unused tail (masked cells were never written)
    return result[0:l]

#-------------------------------------------------------------------------------
# Groupby-related functions

Expand Down