Skip to content

Commit b9cc821

Browse files
mroeschke authored and jreback committed
CLN: ASV io benchmarks (#18906)
1 parent 86f78af commit b9cc821

File tree

9 files changed

+312
-358
lines changed

9 files changed

+312
-358
lines changed

asv_bench/benchmarks/io/excel.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import numpy as np
2+
from pandas import DataFrame, date_range, ExcelWriter, read_excel
3+
from pandas.compat import BytesIO
4+
import pandas.util.testing as tm
5+
6+
from ..pandas_vb_common import BaseIO, setup # noqa
7+
8+
9+
class Excel(object):
    """Benchmark Excel reads and writes through in-memory buffers.

    Parametrized over the ``engine`` name passed to ``ExcelWriter``.
    """

    goal_time = 0.2
    params = ['openpyxl', 'xlsxwriter', 'xlwt']
    param_names = ['engine']

    def setup(self, engine):
        """Build the benchmark frame and pre-write a workbook to read back.

        Parameters
        ----------
        engine : str
            Writer engine used to produce the workbook stored in
            ``self.bio_read``.
        """
        N = 2000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

    def time_read_excel(self, engine):
        """Time ``read_excel`` on the workbook written in ``setup``."""
        # A timed method may run more than once per ``setup`` call; rewind
        # so a previous read cannot leave the buffer positioned at EOF.
        self.bio_read.seek(0)
        read_excel(self.bio_read)

    def time_write_excel(self, engine):
        """Time writing ``self.df`` to a new in-memory workbook."""
        # Use a fresh buffer and writer per call: saving the same writer
        # repeatedly would re-save an already finalized workbook instead of
        # measuring an independent write each iteration.
        bio_write = BytesIO()
        writer_write = ExcelWriter(bio_write, engine=engine)
        self.df.to_excel(writer_write, sheet_name='Sheet1')
        writer_write.save()

asv_bench/benchmarks/hdfstore_bench.py renamed to asv_bench/benchmarks/io/hdf.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import numpy as np
2-
from pandas import DataFrame, Panel, date_range, HDFStore
2+
from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf
33
import pandas.util.testing as tm
44

5-
from .pandas_vb_common import BaseIO, setup # noqa
5+
from ..pandas_vb_common import BaseIO, setup # noqa
66

77

8-
class HDF5(BaseIO):
8+
class HDFStoreDataFrame(BaseIO):
99

1010
goal_time = 0.2
1111

@@ -34,9 +34,9 @@ def setup(self):
3434
self.df_dc = DataFrame(np.random.randn(N, 10),
3535
columns=['C%03d' % i for i in range(10)])
3636

37-
self.f = '__test__.h5'
37+
self.fname = '__test__.h5'
3838

39-
self.store = HDFStore(self.f)
39+
self.store = HDFStore(self.fname)
4040
self.store.put('fixed', self.df)
4141
self.store.put('fixed_mixed', self.df_mixed)
4242
self.store.append('table', self.df2)
@@ -46,7 +46,7 @@ def setup(self):
4646

4747
def teardown(self):
4848
self.store.close()
49-
self.remove(self.f)
49+
self.remove(self.fname)
5050

5151
def time_read_store(self):
5252
self.store.get('fixed')
@@ -99,25 +99,48 @@ def time_store_info(self):
9999
self.store.info()
100100

101101

102-
class HDF5Panel(BaseIO):
102+
class HDFStorePanel(BaseIO):
    """Benchmark selecting and appending a ``Panel`` in an ``HDFStore``."""

    goal_time = 0.2

    def setup(self):
        """Create the panel and store it under key ``'p1'``."""
        self.fname = '__test__.h5'
        item_labels = ['Item%03d' % i for i in range(20)]
        major_labels = date_range('1/1/2000', periods=1000)
        minor_labels = ['E%03d' % i for i in range(25)]
        self.p = Panel(np.random.randn(20, 1000, 25),
                       items=item_labels,
                       major_axis=major_labels,
                       minor_axis=minor_labels)
        self.store = HDFStore(self.fname)
        self.store.append('p1', self.p)

    def teardown(self):
        """Close the store and remove the backing file."""
        self.store.close()
        self.remove(self.fname)

    def time_read_store_table_panel(self):
        """Time selecting the stored panel."""
        self.store.select('p1')

    def time_write_store_table_panel(self):
        """Time appending the panel under a second key."""
        self.store.append('p2', self.p)
124+
125+
126+
class HDF(BaseIO):
    """Benchmark ``read_hdf`` and ``DataFrame.to_hdf`` per storage format."""

    goal_time = 0.2
    params = ['table', 'fixed']
    param_names = ['format']

    def setup(self, format):
        """Build the frame and write it once so the read benchmark has data."""
        self.fname = '__test__.h5'
        n_rows = 100000
        n_cols = 5
        values = np.random.randn(n_rows, n_cols)
        col_names = ['float{}'.format(i) for i in range(n_cols)]
        hourly = date_range('20000101', periods=n_rows, freq='H')
        self.df = DataFrame(values, columns=col_names, index=hourly)
        self.df['object'] = tm.makeStringIndex(n_rows)
        self.df.to_hdf(self.fname, 'df', format=format)

    def time_read_hdf(self, format):
        """Time reading key ``'df'`` back from the file."""
        read_hdf(self.fname, 'df')

    def time_write_hdf(self, format):
        """Time writing the frame under key ``'df'`` in the given format."""
        self.df.to_hdf(self.fname, 'df', format=format)

asv_bench/benchmarks/io/msgpack.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import numpy as np
2+
from pandas import DataFrame, date_range, read_msgpack
3+
import pandas.util.testing as tm
4+
5+
from ..pandas_vb_common import BaseIO, setup # noqa
6+
7+
8+
class MSGPack(BaseIO):
    """Benchmark ``read_msgpack`` and ``DataFrame.to_msgpack``."""

    goal_time = 0.2

    def setup(self):
        """Build the frame and write it once so the read benchmark has data."""
        self.fname = '__test__.msg'
        n_rows = 100000
        n_cols = 5
        values = np.random.randn(n_rows, n_cols)
        col_names = ['float{}'.format(i) for i in range(n_cols)]
        hourly = date_range('20000101', periods=n_rows, freq='H')
        self.df = DataFrame(values, columns=col_names, index=hourly)
        self.df['object'] = tm.makeStringIndex(n_rows)
        self.df.to_msgpack(self.fname)

    def time_read_msgpack(self):
        """Time reading the msgpack file written in ``setup``."""
        read_msgpack(self.fname)

    def time_write_msgpack(self):
        """Time writing the frame to the msgpack file."""
        self.df.to_msgpack(self.fname)

asv_bench/benchmarks/io/pickle.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import numpy as np
2+
from pandas import DataFrame, date_range, read_pickle
3+
import pandas.util.testing as tm
4+
5+
from ..pandas_vb_common import BaseIO, setup # noqa
6+
7+
8+
class Pickle(BaseIO):
    """Benchmark ``read_pickle`` and ``DataFrame.to_pickle``."""

    goal_time = 0.2

    def setup(self):
        """Build the frame and pickle it once so the read benchmark has data."""
        self.fname = '__test__.pkl'
        n_rows = 100000
        n_cols = 5
        values = np.random.randn(n_rows, n_cols)
        col_names = ['float{}'.format(i) for i in range(n_cols)]
        hourly = date_range('20000101', periods=n_rows, freq='H')
        self.df = DataFrame(values, columns=col_names, index=hourly)
        self.df['object'] = tm.makeStringIndex(n_rows)
        self.df.to_pickle(self.fname)

    def time_read_pickle(self):
        """Time loading the pickle written in ``setup``."""
        read_pickle(self.fname)

    def time_write_pickle(self):
        """Time pickling the frame to the file."""
        self.df.to_pickle(self.fname)

asv_bench/benchmarks/io/sas.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import os
2+
3+
from pandas import read_sas
4+
5+
6+
class SAS(object):
    """Benchmark ``read_sas`` on sample files, one per SAS file format."""

    goal_time = 0.2
    params = ['sas7bdat', 'xport']
    param_names = ['format']

    def setup(self, format):
        """Resolve the sample file for ``format`` and store it in ``self.f``.

        Parameters
        ----------
        format : str
            Either ``'sas7bdat'`` or ``'xport'``; selects the sample file.
        """
        # Sample files are looked up in 'pandas/tests/io/sas/data', reached
        # by walking three levels up from this module's directory.
        files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'}
        fname = files[format]
        paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas',
                 'tests', 'io', 'sas', 'data', fname]
        self.f = os.path.join(*paths)

    def time_read_sas(self, format):
        """Time ``read_sas`` on the resolved sample file."""
        read_sas(self.f, format=format)

asv_bench/benchmarks/io/sql.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import sqlite3
2+
3+
import numpy as np
4+
import pandas.util.testing as tm
5+
from pandas import DataFrame, date_range, read_sql_query, read_sql_table
6+
from sqlalchemy import create_engine
7+
8+
from ..pandas_vb_common import setup # noqa
9+
10+
11+
class SQL(object):
    """Benchmark whole-table ``to_sql`` writes and ``SELECT *`` reads.

    Parametrized over the connection kind: a SQLAlchemy engine or a raw
    ``sqlite3`` connection, each opened on ``:memory:``.
    """

    goal_time = 0.2
    params = ['sqlalchemy', 'sqlite']
    param_names = ['connection']

    def setup(self, connection):
        """Create the connection, build the frame and load the source table."""
        n_rows = 10000
        connections = {'sqlalchemy': create_engine('sqlite:///:memory:'),
                       'sqlite': sqlite3.connect(':memory:')}
        self.table_name = 'test_type'
        self.query_all = 'SELECT * FROM {}'.format(self.table_name)
        self.con = connections[connection]
        data = {'float': np.random.randn(n_rows),
                'float_with_nan': np.random.randn(n_rows),
                'string': ['foo'] * n_rows,
                'bool': [True] * n_rows,
                'int': np.random.randint(0, n_rows, size=n_rows),
                'datetime': date_range('2000-01-01',
                                       periods=n_rows,
                                       freq='s')}
        self.df = DataFrame(data, index=tm.makeStringIndex(n_rows))
        # Blank out a slice so 'float_with_nan' actually contains NaN.
        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
        self.df['datetime_string'] = self.df['datetime'].astype(str)
        self.df.to_sql(self.table_name, self.con, if_exists='replace')

    def time_to_sql_dataframe(self, connection):
        """Time writing the whole frame to table ``'test1'``."""
        self.df.to_sql('test1', self.con, if_exists='replace')

    def time_read_sql_query(self, connection):
        """Time reading every column of the source table via a query."""
        read_sql_query(self.query_all, self.con)
42+
43+
44+
class WriteSQLDtypes(object):
    """Benchmark single-column ``to_sql`` writes and column-select queries.

    Parametrized over the connection kind and the column (``dtype``) used.
    """

    goal_time = 0.2
    params = (['sqlalchemy', 'sqlite'],
              ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'])
    param_names = ['connection', 'dtype']

    def setup(self, connection, dtype):
        """Create the connection, build the frame and load the source table."""
        n_rows = 10000
        connections = {'sqlalchemy': create_engine('sqlite:///:memory:'),
                       'sqlite': sqlite3.connect(':memory:')}
        self.table_name = 'test_type'
        self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name)
        self.con = connections[connection]
        data = {'float': np.random.randn(n_rows),
                'float_with_nan': np.random.randn(n_rows),
                'string': ['foo'] * n_rows,
                'bool': [True] * n_rows,
                'int': np.random.randint(0, n_rows, size=n_rows),
                'datetime': date_range('2000-01-01',
                                       periods=n_rows,
                                       freq='s')}
        self.df = DataFrame(data, index=tm.makeStringIndex(n_rows))
        # Blank out a slice so 'float_with_nan' actually contains NaN.
        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
        self.df['datetime_string'] = self.df['datetime'].astype(str)
        self.df.to_sql(self.table_name, self.con, if_exists='replace')

    def time_to_sql_dataframe_column(self, connection, dtype):
        """Time writing only the ``dtype`` column to table ``'test1'``."""
        single_column = self.df[[dtype]]
        single_column.to_sql('test1', self.con, if_exists='replace')

    def time_read_sql_query_select_column(self, connection, dtype):
        """Time the query that selects only the ``dtype`` column."""
        read_sql_query(self.query_col, self.con)
76+
77+
78+
class ReadSQLTable(object):
    """Benchmark ``read_sql_table`` on a SQLAlchemy in-memory SQLite engine."""

    goal_time = 0.2

    def setup(self):
        """Create the engine, build the frame and load table ``'test'``."""
        n_rows = 10000
        self.table_name = 'test'
        self.con = create_engine('sqlite:///:memory:')
        data = {'float': np.random.randn(n_rows),
                'float_with_nan': np.random.randn(n_rows),
                'string': ['foo'] * n_rows,
                'bool': [True] * n_rows,
                'int': np.random.randint(0, n_rows, size=n_rows),
                'datetime': date_range('2000-01-01',
                                       periods=n_rows,
                                       freq='s')}
        self.df = DataFrame(data, index=tm.makeStringIndex(n_rows))
        # Blank out a slice so 'float_with_nan' actually contains NaN.
        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
        self.df['datetime_string'] = self.df['datetime'].astype(str)
        self.df.to_sql(self.table_name, self.con, if_exists='replace')

    def time_read_sql_table_all(self):
        """Time reading the entire table."""
        read_sql_table(self.table_name, self.con)

    def time_read_sql_table_parse_dates(self):
        """Time reading the string date column with date parsing enabled."""
        date_column = ['datetime_string']
        read_sql_table(self.table_name, self.con, columns=date_column,
                       parse_dates=['datetime_string'])
105+
106+
107+
class ReadSQLTableDtypes(object):
    """Benchmark ``read_sql_table`` restricted to one column at a time.

    Parametrized over the column (``dtype``) that is read back.
    """

    goal_time = 0.2

    params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']
    param_names = ['dtype']

    def setup(self, dtype):
        """Create the engine, build the frame and load table ``'test'``."""
        n_rows = 10000
        self.table_name = 'test'
        self.con = create_engine('sqlite:///:memory:')
        data = {'float': np.random.randn(n_rows),
                'float_with_nan': np.random.randn(n_rows),
                'string': ['foo'] * n_rows,
                'bool': [True] * n_rows,
                'int': np.random.randint(0, n_rows, size=n_rows),
                'datetime': date_range('2000-01-01',
                                       periods=n_rows,
                                       freq='s')}
        self.df = DataFrame(data, index=tm.makeStringIndex(n_rows))
        # Blank out a slice so 'float_with_nan' actually contains NaN.
        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
        self.df['datetime_string'] = self.df['datetime'].astype(str)
        self.df.to_sql(self.table_name, self.con, if_exists='replace')

    def time_read_sql_table_column(self, dtype):
        """Time reading only the ``dtype`` column of the table."""
        wanted = [dtype]
        read_sql_table(self.table_name, self.con, columns=wanted)

asv_bench/benchmarks/io/stata.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import numpy as np
2+
from pandas import DataFrame, date_range, read_stata
3+
import pandas.util.testing as tm
4+
5+
from ..pandas_vb_common import BaseIO, setup # noqa
6+
7+
8+
class Stata(BaseIO):
    """Benchmark ``read_stata`` and ``DataFrame.to_stata``.

    Parametrized over the Stata date format applied to the index through
    ``convert_dates``.
    """

    goal_time = 0.2
    params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty']
    param_names = ['convert_dates']

    def setup(self, convert_dates):
        """Build a mixed-dtype frame and write it once to the .dta file."""
        self.fname = '__test__.dta'
        n_rows = 100000
        n_cols = 5
        values = np.random.randn(n_rows, n_cols)
        col_names = ['float{}'.format(i) for i in range(n_cols)]
        hourly = date_range('20000101', periods=n_rows, freq='H')
        self.df = DataFrame(values, columns=col_names, index=hourly)
        self.df['object'] = tm.makeStringIndex(n_rows)
        # One random integer column per width; the upper bound is pulled in
        # by 27 (presumably to stay clear of values Stata reserves at the
        # top of each integer range -- TODO confirm).
        int_columns = (('int8_', np.int8),
                       ('int16_', np.int16),
                       ('int32_', np.int32))
        for col_name, int_type in int_columns:
            limits = np.iinfo(int_type)
            self.df[col_name] = np.random.randint(limits.min,
                                                  limits.max - 27, n_rows)
        self.df['float32_'] = np.array(np.random.randn(n_rows),
                                       dtype=np.float32)
        self.convert_dates = {'index': convert_dates}
        self.df.to_stata(self.fname, self.convert_dates)

    def time_read_stata(self, convert_dates):
        """Time reading the file written in ``setup``."""
        read_stata(self.fname)

    def time_write_stata(self, convert_dates):
        """Time writing the frame with the index date conversion applied."""
        self.df.to_stata(self.fname, self.convert_dates)

0 commit comments

Comments
 (0)