Skip to content

Commit 8915ce6

Browse files
committed
Merge pull request #4457 from jreback/hdf_opt
PERF: enhance HDFStore Table writing performance
2 parents 442b7ee + 6c2197d commit 8915ce6

File tree

4 files changed

+36
-166
lines changed

4 files changed

+36
-166
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ pandas 0.13
4646
``read_table``, ``read_csv``, etc.
4747
- Added a more informative error message when plot arguments contain
4848
overlapping color and style arguments (:issue:`4402`)
49+
- Significant table writing performance improvements in ``HDFStore``
4950

5051
**API Changes**
5152

doc/source/v0.13.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ API changes
3232
- ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`)
3333
- ``HDFStore``
3434

35+
- Significant table writing performance improvements
3536
- added an ``is_open`` property to indicate if the underlying file handle is open;
3637
a closed store will now report 'CLOSED' when viewing the store (rather than raising an error)
3738
(:issue:`4409`)

pandas/io/pytables.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,7 +1037,6 @@ class IndexCol(StringMixin):
10371037
"""
10381038
is_an_indexable = True
10391039
is_data_indexable = True
1040-
is_searchable = False
10411040
_info_fields = ['freq','tz','index_name']
10421041

10431042
def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None,
@@ -1299,7 +1298,6 @@ class DataCol(IndexCol):
12991298
"""
13001299
is_an_indexable = False
13011300
is_data_indexable = False
1302-
is_searchable = False
13031301
_info_fields = ['tz']
13041302

13051303
@classmethod
@@ -1588,10 +1586,6 @@ class DataIndexableCol(DataCol):
15881586
""" represent a data column that can be indexed """
15891587
is_data_indexable = True
15901588

1591-
@property
1592-
def is_searchable(self):
1593-
return _ensure_decoded(self.kind) == u('string')
1594-
15951589
def get_atom_string(self, block, itemsize):
15961590
return _tables().StringCol(itemsize=itemsize)
15971591

@@ -3061,8 +3055,6 @@ def write_data(self, chunksize):
30613055

30623056
# the arguments
30633057
indexes = [a.cvalues for a in self.index_axes]
3064-
search = np.array(
3065-
[a.is_searchable for a in self.values_axes]).astype('u1')
30663058
values = [a.take_data() for a in self.values_axes]
30673059

30683060
# transpose the values so first dimension is last
@@ -3083,22 +3075,49 @@ def write_data(self, chunksize):
30833075
self.write_data_chunk(
30843076
indexes=[a[start_i:end_i] for a in indexes],
30853077
mask=mask[start_i:end_i],
3086-
search=search,
30873078
values=[v[start_i:end_i] for v in values])
30883079

3089-
def write_data_chunk(self, indexes, mask, search, values):
3080+
def write_data_chunk(self, indexes, mask, values):
30903081

30913082
# 0 len
30923083
for v in values:
30933084
if not np.prod(v.shape):
30943085
return
30953086

3096-
# get our function
30973087
try:
3098-
func = getattr(lib, "create_hdf_rows_%sd" % self.ndim)
3099-
args = list(indexes)
3100-
args.extend([self.dtype, mask, search, values])
3101-
rows = func(*args)
3088+
nrows = np.prod([ idx.shape[0] for idx in indexes ])
3089+
rows = np.empty(nrows,dtype=self.dtype)
3090+
names = self.dtype.names
3091+
3092+
# indexes
3093+
nindexes = len(indexes)
3094+
for i, idx in enumerate(indexes):
3095+
3096+
# broadcast to all other indexes except myself
3097+
if i > 0 and i < nindexes:
3098+
repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
3099+
idx = np.tile(idx,repeater)
3100+
3101+
if i < nindexes-1:
3102+
repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
3103+
idx = np.repeat(idx,repeater)
3104+
3105+
rows[names[i]] = idx
3106+
3107+
# values
3108+
for i, v in enumerate(values):
3109+
name = names[nindexes + i]
3110+
b = values[i]
3111+
3112+
# reshape
3113+
new_shape = (nrows,) + self.dtype[name].shape
3114+
b = b.ravel().reshape(new_shape)
3115+
3116+
rows[name] = b
3117+
3118+
# mask
3119+
rows = rows[~mask.ravel().astype(bool)]
3120+
31023121
except Exception as detail:
31033122
raise Exception("cannot create row-data -> %s" % str(detail))
31043123

pandas/lib.pyx

Lines changed: 0 additions & 151 deletions
Original file line numberDiff line numberDiff line change
@@ -842,157 +842,6 @@ def write_csv_rows(list data, list data_index, int nlevels, list cols, object wr
842842
if j >= 0 and (j < N-1 or (j % N) != N-1 ):
843843
writer.writerows(rows[:((j+1) % N)])
844844

845-
846-
@cython.boundscheck(False)
847-
@cython.wraparound(False)
848-
def create_hdf_rows_2d(ndarray indexer0,
849-
object dtype,
850-
ndarray[np.uint8_t, ndim=1] mask,
851-
ndarray[np.uint8_t, ndim=1] searchable,
852-
list values):
853-
""" return a list of objects ready to be converted to rec-array format """
854-
855-
cdef:
856-
int i, l, b, n_indexer0, n_blocks, tup_size
857-
ndarray result
858-
tuple tup
859-
object v
860-
861-
n_indexer0 = indexer0.shape[0]
862-
n_blocks = len(values)
863-
tup_size = n_blocks+1
864-
865-
result = np.empty(n_indexer0,dtype=dtype)
866-
l = 0
867-
for i in range(n_indexer0):
868-
869-
if not mask[i]:
870-
871-
tup = PyTuple_New(tup_size)
872-
873-
v = indexer0[i]
874-
PyTuple_SET_ITEM(tup, 0, v)
875-
Py_INCREF(v)
876-
877-
for b in range(n_blocks):
878-
879-
v = values[b][i]
880-
if searchable[b]:
881-
v = v[0]
882-
883-
PyTuple_SET_ITEM(tup, b+1, v)
884-
Py_INCREF(v)
885-
886-
result[l] = tup
887-
l += 1
888-
889-
return result[0:l]
890-
891-
@cython.boundscheck(False)
892-
@cython.wraparound(False)
893-
def create_hdf_rows_3d(ndarray indexer0, ndarray indexer1,
894-
object dtype,
895-
ndarray[np.uint8_t, ndim=2] mask,
896-
ndarray[np.uint8_t, ndim=1] searchable,
897-
list values):
898-
""" return a list of objects ready to be converted to rec-array format """
899-
900-
cdef:
901-
int i, j, l, b, n_indexer0, n_indexer1, n_blocks, tup_size
902-
tuple tup
903-
object v
904-
ndarray result
905-
906-
n_indexer0 = indexer0.shape[0]
907-
n_indexer1 = indexer1.shape[0]
908-
n_blocks = len(values)
909-
tup_size = n_blocks+2
910-
result = np.empty(n_indexer0*n_indexer1,dtype=dtype)
911-
l = 0
912-
for i from 0 <= i < n_indexer0:
913-
914-
for j from 0 <= j < n_indexer1:
915-
916-
if not mask[i, j]:
917-
918-
tup = PyTuple_New(tup_size)
919-
920-
v = indexer0[i]
921-
PyTuple_SET_ITEM(tup, 0, v)
922-
Py_INCREF(v)
923-
v = indexer1[j]
924-
PyTuple_SET_ITEM(tup, 1, v)
925-
Py_INCREF(v)
926-
927-
for b from 0 <= b < n_blocks:
928-
929-
v = values[b][i, j]
930-
if searchable[b]:
931-
v = v[0]
932-
933-
PyTuple_SET_ITEM(tup, b+2, v)
934-
Py_INCREF(v)
935-
936-
result[l] = tup
937-
l += 1
938-
939-
return result[0:l]
940-
941-
@cython.boundscheck(False)
942-
@cython.wraparound(False)
943-
def create_hdf_rows_4d(ndarray indexer0, ndarray indexer1, ndarray indexer2,
944-
object dtype,
945-
ndarray[np.uint8_t, ndim=3] mask,
946-
ndarray[np.uint8_t, ndim=1] searchable,
947-
list values):
948-
""" return a list of objects ready to be converted to rec-array format """
949-
950-
cdef:
951-
int i, j, k, l, b, n_indexer0, n_indexer1, n_indexer2, n_blocks, tup_size
952-
tuple tup
953-
object v
954-
ndarray result
955-
956-
n_indexer0 = indexer0.shape[0]
957-
n_indexer1 = indexer1.shape[0]
958-
n_indexer2 = indexer2.shape[0]
959-
n_blocks = len(values)
960-
tup_size = n_blocks+3
961-
result = np.empty(n_indexer0*n_indexer1*n_indexer2,dtype=dtype)
962-
l = 0
963-
for i from 0 <= i < n_indexer0:
964-
965-
for j from 0 <= j < n_indexer1:
966-
967-
for k from 0 <= k < n_indexer2:
968-
969-
if not mask[i, j, k]:
970-
971-
tup = PyTuple_New(tup_size)
972-
973-
v = indexer0[i]
974-
PyTuple_SET_ITEM(tup, 0, v)
975-
Py_INCREF(v)
976-
v = indexer1[j]
977-
PyTuple_SET_ITEM(tup, 1, v)
978-
Py_INCREF(v)
979-
v = indexer2[k]
980-
PyTuple_SET_ITEM(tup, 2, v)
981-
Py_INCREF(v)
982-
983-
for b from 0 <= b < n_blocks:
984-
985-
v = values[b][i, j, k]
986-
if searchable[b]:
987-
v = v[0]
988-
PyTuple_SET_ITEM(tup, b+3, v)
989-
Py_INCREF(v)
990-
991-
result[l] = tup
992-
l += 1
993-
994-
return result[0:l]
995-
996845
#-------------------------------------------------------------------------------
997846
# Groupby-related functions
998847

0 commit comments

Comments
 (0)