From 3a979acb80fb3a8b2872ca2f4d183e794290227e Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 5 Aug 2013 09:11:00 -0400 Subject: [PATCH] BUG: bug when using chunksize and writing ndim > 2 --- pandas/io/pytables.py | 64 +++++++++++++++++--------------- pandas/io/tests/test_pytables.py | 23 ++++++++++++ 2 files changed, 58 insertions(+), 29 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0f429234ba3dc..2f0374e60c955 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3037,7 +3037,11 @@ def write(self, obj, axes=None, append=False, complib=None, self.write_data(chunksize) def write_data(self, chunksize): - """ fast writing of data: requires specific cython routines each axis shape """ + """ we form the data into a 2-d including indexes,values,mask + write chunk-by-chunk """ + + names = self.dtype.names + nrows = self.nrows_expected # create the masks & values masks = [] @@ -3052,30 +3056,49 @@ def write_data(self, chunksize): mask = masks[0] for m in masks[1:]: mask = mask & m + mask = mask.ravel() + + # broadcast the indexes if needed + indexes = [ a.cvalues for a in self.index_axes ] + nindexes = len(indexes) + bindexes = [] + for i, idx in enumerate(indexes): + + # broadcast to all other indexes except myself + if i > 0 and i < nindexes: + repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)]) + idx = np.tile(idx,repeater) - # the arguments - indexes = [a.cvalues for a in self.index_axes] - values = [a.take_data() for a in self.values_axes] + if i < nindexes-1: + repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)]) + idx = np.repeat(idx,repeater) + + bindexes.append(idx) # transpose the values so first dimension is last + # reshape the values if needed + values = [ a.take_data() for a in self.values_axes] values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ] + bvalues = [] + for i, v in enumerate(values): + new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape + bvalues.append(values[i].ravel().reshape(new_shape)) # write the chunks if chunksize is None: chunksize = 100000 - rows = self.nrows_expected - chunks = int(rows / chunksize) + 1 + chunks = int(nrows / chunksize) + 1 for i in range(chunks): start_i = i * chunksize - end_i = min((i + 1) * chunksize, rows) + end_i = min((i + 1) * chunksize, nrows) if start_i >= end_i: break self.write_data_chunk( - indexes=[a[start_i:end_i] for a in indexes], + indexes=[a[start_i:end_i] for a in bindexes], mask=mask[start_i:end_i], - values=[v[start_i:end_i] for v in values]) + values=[v[start_i:end_i] for v in bvalues]) def write_data_chunk(self, indexes, mask, values): @@ -3085,35 +3108,18 @@ def write_data_chunk(self, indexes, mask, values): return try: - nrows = np.prod([ idx.shape[0] for idx in indexes ]) + nrows = indexes[0].shape[0] rows = np.empty(nrows,dtype=self.dtype) names = self.dtype.names + nindexes = len(indexes) # indexes - nindexes = len(indexes) for i, idx in enumerate(indexes): - - # broadcast to all other indexes except myself - if i > 0 and i < nindexes: - repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)]) - idx = np.tile(idx,repeater) - - if i < nindexes-1: - repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)]) - idx = np.repeat(idx,repeater) - rows[names[i]] = idx # values for i, v in enumerate(values): - name = names[nindexes + i] - b = values[i] - - # reshape - new_shape = (nrows,) + self.dtype[name].shape - b = b.ravel().reshape(new_shape) - - rows[name] = b + rows[names[i+nindexes]] = v # mask rows = rows[~mask.ravel().astype(bool)] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index c2564a6e12145..a5c4cb49bead8 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1237,6 +1237,29 @@ def test_append_misc(self): result = store.select('df1') tm.assert_frame_equal(result, df) + # more chunksize in append tests + def check(obj, comparator): + for c in [10, 200, 1000]: + with ensure_clean(self.path,mode='w') as store: + store.append('obj', obj, chunksize=c) + result = store.select('obj') + comparator(result,obj) + + df = tm.makeDataFrame() + df['string'] = 'foo' + df['float322'] = 1. + df['float322'] = df['float322'].astype('float32') + df['bool'] = df['float322'] > 0 + df['time1'] = Timestamp('20130101') + df['time2'] = Timestamp('20130102') + check(df, tm.assert_frame_equal) + + p = tm.makePanel() + check(p, tm.assert_panel_equal) + + p4d = tm.makePanel4D() + check(p4d, tm.assert_panel4d_equal) + def test_append_raise(self): with ensure_clean(self.path) as store: