Skip to content

BUG: fix incorrect writes when using chunksize with data of ndim > 2 #4462

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 5, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 35 additions & 29 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3037,7 +3037,11 @@ def write(self, obj, axes=None, append=False, complib=None,
self.write_data(chunksize)

def write_data(self, chunksize):
""" fast writing of data: requires specific cython routines each axis shape """
""" we form the data into a 2-d structure including indexes, values and mask,
then write it chunk-by-chunk """

names = self.dtype.names
nrows = self.nrows_expected

# create the masks & values
masks = []
Expand All @@ -3052,30 +3056,49 @@ def write_data(self, chunksize):
mask = masks[0]
for m in masks[1:]:
mask = mask & m
mask = mask.ravel()

# broadcast the indexes if needed
indexes = [ a.cvalues for a in self.index_axes ]
nindexes = len(indexes)
bindexes = []
for i, idx in enumerate(indexes):

# broadcast to all other indexes except myself
if i > 0 and i < nindexes:
repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
idx = np.tile(idx,repeater)

# the arguments
indexes = [a.cvalues for a in self.index_axes]
values = [a.take_data() for a in self.values_axes]
if i < nindexes-1:
repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
idx = np.repeat(idx,repeater)

bindexes.append(idx)

# transpose the values so first dimension is last
# reshape the values if needed
values = [ a.take_data() for a in self.values_axes]
values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ]
bvalues = []
for i, v in enumerate(values):
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
bvalues.append(values[i].ravel().reshape(new_shape))

# write the chunks
if chunksize is None:
chunksize = 100000

rows = self.nrows_expected
chunks = int(rows / chunksize) + 1
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
start_i = i * chunksize
end_i = min((i + 1) * chunksize, rows)
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break

self.write_data_chunk(
indexes=[a[start_i:end_i] for a in indexes],
indexes=[a[start_i:end_i] for a in bindexes],
mask=mask[start_i:end_i],
values=[v[start_i:end_i] for v in values])
values=[v[start_i:end_i] for v in bvalues])

def write_data_chunk(self, indexes, mask, values):

Expand All @@ -3085,35 +3108,18 @@ def write_data_chunk(self, indexes, mask, values):
return

try:
nrows = np.prod([ idx.shape[0] for idx in indexes ])
nrows = indexes[0].shape[0]
rows = np.empty(nrows,dtype=self.dtype)
names = self.dtype.names
nindexes = len(indexes)

# indexes
nindexes = len(indexes)
for i, idx in enumerate(indexes):

# broadcast to all other indexes except myself
if i > 0 and i < nindexes:
repeater = np.prod([indexes[bi].shape[0] for bi in range(0,i)])
idx = np.tile(idx,repeater)

if i < nindexes-1:
repeater = np.prod([indexes[bi].shape[0] for bi in range(i+1,nindexes)])
idx = np.repeat(idx,repeater)

rows[names[i]] = idx

# values
for i, v in enumerate(values):
name = names[nindexes + i]
b = values[i]

# reshape
new_shape = (nrows,) + self.dtype[name].shape
b = b.ravel().reshape(new_shape)

rows[name] = b
rows[names[i+nindexes]] = v

# mask
rows = rows[~mask.ravel().astype(bool)]
Expand Down
23 changes: 23 additions & 0 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1237,6 +1237,29 @@ def test_append_misc(self):
result = store.select('df1')
tm.assert_frame_equal(result, df)

# more chunksize in append tests
def check(obj, comparator):
for c in [10, 200, 1000]:
with ensure_clean(self.path,mode='w') as store:
store.append('obj', obj, chunksize=c)
result = store.select('obj')
comparator(result,obj)

df = tm.makeDataFrame()
df['string'] = 'foo'
df['float322'] = 1.
df['float322'] = df['float322'].astype('float32')
df['bool'] = df['float322'] > 0
df['time1'] = Timestamp('20130101')
df['time2'] = Timestamp('20130102')
check(df, tm.assert_frame_equal)

p = tm.makePanel()
check(p, tm.assert_panel_equal)

p4d = tm.makePanel4D()
check(p4d, tm.assert_panel4d_equal)

def test_append_raise(self):

with ensure_clean(self.path) as store:
Expand Down