Open
Description
Problem Description
Compared to the legacy pandas 0.2x.x releases, the most recent version (1.0.1) performs poorly when loading from an HDF5 file with a `where` condition, unless the data was sorted on the conditioned column before it was written.
import os
import numpy as np
import pandas as pd
# Dimensions of the base frame and the business-day calendar used to index it.
row, col = 6000, 2
start_date, freq = '20000101', 'B'
ind = pd.date_range(start_date, periods=row, freq=freq)

# Random values indexed by date; the index is then moved into a regular
# column so that it can be stored (and queried) as a data column.
a = pd.DataFrame(
    np.random.rand(row * col).reshape((row, -1)),
    index=ind,
).reset_index(drop=False)
a.columns = ['date', 'x', 'y']

# Repeat the frame 2000 times to get a table large enough to time.
a = pd.concat([a] * 2000)

# Write the same rows twice: once ordered by 'y' (so 'date' is unordered on
# disk) and once ordered by 'date'.
home = os.path.expanduser('~')
a.sort_values('y').to_hdf(
    os.path.join(home, 'hdf_test.h5'),
    key='df',
    data_columns=True,
    format='table',
)
a.sort_values('date').to_hdf(
    os.path.join(home, 'hdf_test_sorted.h5'),
    key='df',
    data_columns=True,
    format='table',
)
# Run the same date-range query against both files. The context managers
# guarantee each store is closed even if `select` raises.
home = os.path.expanduser('~')
date_filter = "{0}>=20150101 & {0}<=20160730".format('date')

# significantly slower in pandas 1.0.1 (this is not an issue in pandas 0.2x.x):
# this file was written sorted by 'y', so 'date' is unordered on disk
with pd.HDFStore(os.path.join(home, 'hdf_test.h5'), mode='r') as store:
    result = store.select('df', where=date_filter)

# as fast as pandas 0.2x.x if data was sorted on the 'date' column
with pd.HDFStore(os.path.join(home, 'hdf_test_sorted.h5'), mode='r') as store:
    result = store.select('df', where=date_filter)