CLN: ASV io benchmarks (#18906)

mroeschke · jreback · commit b9cc8212b69b · 2017-12-23T15:04:43.000-05:00
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
@@ -0,0 +1,37 @@
+import numpy as np
+from pandas import DataFrame, date_range, ExcelWriter, read_excel
+from pandas.compat import BytesIO
+import pandas.util.testing as tm
+
+from ..pandas_vb_common import BaseIO, setup  # noqa
+
+
+class Excel(object):
+
+    goal_time = 0.2
+    params = ['openpyxl', 'xlsxwriter', 'xlwt']
+    param_names = ['engine']
+
+    def setup(self, engine):
+        N = 2000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.bio_read = BytesIO()
+        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
+        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
+        self.writer_read.save()
+        self.bio_read.seek(0)
+
+        self.bio_write = BytesIO()
+        self.bio_write.seek(0)
+        self.writer_write = ExcelWriter(self.bio_write, engine=engine)
+
+    def time_read_excel(self, engine):
+        read_excel(self.bio_read)
+
+    def time_write_excel(self, engine):
+        self.df.to_excel(self.writer_write, sheet_name='Sheet1')
+        self.writer_write.save()
diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py
@@ -1,11 +1,11 @@
 import numpy as np
-from pandas import DataFrame, Panel, date_range, HDFStore
+from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf
 import pandas.util.testing as tm
 
-from .pandas_vb_common import BaseIO, setup  # noqa
+from ..pandas_vb_common import BaseIO, setup  # noqa
 
 
-class HDF5(BaseIO):
+class HDFStoreDataFrame(BaseIO):
 
     goal_time = 0.2
 
@@ -34,9 +34,9 @@ def setup(self):
         self.df_dc = DataFrame(np.random.randn(N, 10),
                                columns=['C%03d' % i for i in range(10)])
 
-        self.f = '__test__.h5'
+        self.fname = '__test__.h5'
 
-        self.store = HDFStore(self.f)
+        self.store = HDFStore(self.fname)
         self.store.put('fixed', self.df)
         self.store.put('fixed_mixed', self.df_mixed)
         self.store.append('table', self.df2)
@@ -46,7 +46,7 @@ def setup(self):
 
     def teardown(self):
         self.store.close()
-        self.remove(self.f)
+        self.remove(self.fname)
 
     def time_read_store(self):
         self.store.get('fixed')
@@ -99,25 +99,48 @@ def time_store_info(self):
         self.store.info()
 
 
-class HDF5Panel(BaseIO):
+class HDFStorePanel(BaseIO):
 
     goal_time = 0.2
 
     def setup(self):
-        self.f = '__test__.h5'
+        self.fname = '__test__.h5'
         self.p = Panel(np.random.randn(20, 1000, 25),
                        items=['Item%03d' % i for i in range(20)],
                        major_axis=date_range('1/1/2000', periods=1000),
                        minor_axis=['E%03d' % i for i in range(25)])
-        self.store = HDFStore(self.f)
+        self.store = HDFStore(self.fname)
         self.store.append('p1', self.p)
 
     def teardown(self):
         self.store.close()
-        self.remove(self.f)
+        self.remove(self.fname)
 
     def time_read_store_table_panel(self):
         self.store.select('p1')
 
     def time_write_store_table_panel(self):
         self.store.append('p2', self.p)
+
+
+class HDF(BaseIO):
+
+    goal_time = 0.2
+    params = ['table', 'fixed']
+    param_names = ['format']
+
+    def setup(self, format):
+        self.fname = '__test__.h5'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df.to_hdf(self.fname, 'df', format=format)
+
+    def time_read_hdf(self, format):
+        read_hdf(self.fname, 'df')
+
+    def time_write_hdf(self, format):
+        self.df.to_hdf(self.fname, 'df', format=format)
diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py
@@ -0,0 +1,26 @@
+import numpy as np
+from pandas import DataFrame, date_range, read_msgpack
+import pandas.util.testing as tm
+
+from ..pandas_vb_common import BaseIO, setup  # noqa
+
+
+class MSGPack(BaseIO):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.fname = '__test__.msg'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df.to_msgpack(self.fname)
+
+    def time_read_msgpack(self):
+        read_msgpack(self.fname)
+
+    def time_write_msgpack(self):
+        self.df.to_msgpack(self.fname)
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
@@ -0,0 +1,26 @@
+import numpy as np
+from pandas import DataFrame, date_range, read_pickle
+import pandas.util.testing as tm
+
+from ..pandas_vb_common import BaseIO, setup  # noqa
+
+
+class Pickle(BaseIO):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.fname = '__test__.pkl'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df.to_pickle(self.fname)
+
+    def time_read_pickle(self):
+        read_pickle(self.fname)
+
+    def time_write_pickle(self):
+        self.df.to_pickle(self.fname)
diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py
@@ -0,0 +1,21 @@
+import os
+
+from pandas import read_sas
+
+
+class SAS(object):
+
+    goal_time = 0.2
+    params = ['sas7bdat', 'xport']
+    param_names = ['format']
+
+    def setup(self, format):
+        # Read files that are located in 'pandas/io/tests/sas/data'
+        files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'}
+        file = files[format]
+        paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas',
+                 'tests', 'io', 'sas', 'data', file]
+        self.f = os.path.join(*paths)
+
+    def time_read_msgpack(self, format):
+        read_sas(self.f, format=format)
diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py
@@ -0,0 +1,132 @@
+import sqlite3
+
+import numpy as np
+import pandas.util.testing as tm
+from pandas import DataFrame, date_range, read_sql_query, read_sql_table
+from sqlalchemy import create_engine
+
+from ..pandas_vb_common import setup  # noqa
+
+
+class SQL(object):
+
+    goal_time = 0.2
+    params = ['sqlalchemy', 'sqlite']
+    param_names = ['connection']
+
+    def setup(self, connection):
+        N = 10000
+        con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
+               'sqlite': sqlite3.connect(':memory:')}
+        self.table_name = 'test_type'
+        self.query_all = 'SELECT * FROM {}'.format(self.table_name)
+        self.con = con[connection]
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_to_sql_dataframe(self, connection):
+        self.df.to_sql('test1', self.con, if_exists='replace')
+
+    def time_read_sql_query(self, connection):
+        read_sql_query(self.query_all, self.con)
+
+
+class WriteSQLDtypes(object):
+
+    goal_time = 0.2
+    params = (['sqlalchemy', 'sqlite'],
+              ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'])
+    param_names = ['connection', 'dtype']
+
+    def setup(self, connection, dtype):
+        N = 10000
+        con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
+               'sqlite': sqlite3.connect(':memory:')}
+        self.table_name = 'test_type'
+        self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name)
+        self.con = con[connection]
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_to_sql_dataframe_column(self, connection, dtype):
+        self.df[[dtype]].to_sql('test1', self.con, if_exists='replace')
+
+    def time_read_sql_query_select_column(self, connection, dtype):
+        read_sql_query(self.query_col, self.con)
+
+
+class ReadSQLTable(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 10000
+        self.table_name = 'test'
+        self.con = create_engine('sqlite:///:memory:')
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_read_sql_table_all(self):
+        read_sql_table(self.table_name, self.con)
+
+    def time_read_sql_table_parse_dates(self):
+        read_sql_table(self.table_name, self.con, columns=['datetime_string'],
+                       parse_dates=['datetime_string'])
+
+
+class ReadSQLTableDtypes(object):
+
+    goal_time = 0.2
+
+    params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']
+    param_names = ['dtype']
+
+    def setup(self, dtype):
+        N = 10000
+        self.table_name = 'test'
+        self.con = create_engine('sqlite:///:memory:')
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_read_sql_table_column(self, dtype):
+        read_sql_table(self.table_name, self.con, columns=[dtype])
diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py
@@ -0,0 +1,37 @@
+import numpy as np
+from pandas import DataFrame, date_range, read_stata
+import pandas.util.testing as tm
+
+from ..pandas_vb_common import BaseIO, setup  # noqa
+
+
+class Stata(BaseIO):
+
+    goal_time = 0.2
+    params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty']
+    param_names = ['convert_dates']
+
+    def setup(self, convert_dates):
+        self.fname = '__test__.dta'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
+                                             np.iinfo(np.int8).max - 27, N)
+        self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
+                                              np.iinfo(np.int16).max - 27, N)
+        self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min,
+                                              np.iinfo(np.int32).max - 27, N)
+        self.df['float32_'] = np.array(np.random.randn(N),
+                                       dtype=np.float32)
+        self.convert_dates = {'index': convert_dates}
+        self.df.to_stata(self.fname, self.convert_dates)
+
+    def time_read_stata(self, convert_dates):
+        read_stata(self.fname)
+
+    def time_write_stata(self, convert_dates):
+        self.df.to_stata(self.fname, self.convert_dates)
diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py