Skip to content

Update asv config + fix some broken benchmarks #12563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ scikits

# Performance Testing #
#######################
asv_bench/
asv_bench/env/
asv_bench/html/
asv_bench/results/
asv_bench/pandas/

# Documentation generated files #
#################################
Expand Down
69 changes: 62 additions & 7 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,62 @@

// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list indicates to just test against the default (latest)
// version.
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPI, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
"matrix": {
// To run against multiple versions, replace with
// "numpy": ["1.7", "1.9"],
"numpy": [],
"Cython": [],
"matplotlib": [],
"sqlalchemy": [],
"scipy": [],
"numexpr": [],
"pytables": [],
"pytables": [null, ""], // platform dependent, see excludes below
"tables": [null, ""],
"libpython": [null, ""],
"openpyxl": [],
"xlsxwriter": [],
"xlrd": [],
"xlwt": []
},

// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so how does one choose the installation method / use this? Ideally there would be examples for conda (the default) / pip.

//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
"exclude": [
// On conda install pytables, otherwise tables
{"environment_type": "conda", "tables": ""},
{"environment_type": "conda", "pytables": null},
{"environment_type": "virtualenv", "tables": null},
{"environment_type": "virtualenv", "pytables": ""},
// On conda&win32, install libpython
{"sys_platform": "(?!win32).*", "libpython": ""},
{"sys_platform": "win32", "libpython": null},
{"environment_type": "(?!conda).*", "libpython": ""}
],
"include": [],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
// "benchmark_dir": "benchmarks",
Expand All @@ -56,7 +94,6 @@
// environments in. If not provided, defaults to "env"
// "env_dir": "env",


// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
// "results_dir": "results",
Expand All @@ -66,5 +103,23 @@
// "html_dir": "html",

// The number of characters to retain in the commit hashes.
// "hash_length": 8
// "hash_length": 8,

// `asv` will cache wheels of the recent builds in each
// environment, making them faster to install next time. This is
// number of builds to keep, per environment.
"wheel_cache_size": 8,

// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// }
}
197 changes: 22 additions & 175 deletions asv_bench/benchmarks/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,192 +3,36 @@
import pandas.computation.expressions as expr


class eval_frame_add_all_threads(object):
class eval_frame(object):
goal_time = 0.2

def setup(self):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
self.df3 = DataFrame(np.random.randn(20000, 100))
self.df4 = DataFrame(np.random.randn(20000, 100))

def time_eval_frame_add_all_threads(self):
pd.eval('df + df2 + df3 + df4')


class eval_frame_add_one_thread(object):
    """ASV benchmark: frame-wise addition via pd.eval with numexpr pinned to one thread."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))
        # Each ASV benchmark runs in its own process, so this does not leak
        # into other benchmarks.
        expr.set_numexpr_threads(1)

    def time_eval_frame_add_one_thread(self):
        # pd.eval resolves bare names in the *calling frame*; attributes on
        # self are not visible there, so bind them to locals first (without
        # this the original raised UndefinedVariableError for 'df').
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('df + df2 + df3 + df4')


class eval_frame_add_python(object):
    """ASV benchmark: frame-wise addition via pd.eval with the pure-python engine."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))

    def time_eval_frame_add_python(self):
        # pd.eval resolves bare names in the *calling frame*; attributes on
        # self are not visible there, so bind them to locals first (without
        # this the original raised UndefinedVariableError for 'df').
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('df + df2 + df3 + df4', engine='python')


class eval_frame_add_python_one_thread(object):
    """ASV benchmark: python-engine addition with numexpr pinned to one thread.

    The single-thread setting only matters if a later change routes work to
    numexpr; it keeps this case directly comparable to the numexpr variants.
    """
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))
        expr.set_numexpr_threads(1)

    def time_eval_frame_add_python_one_thread(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('df + df2 + df3 + df4', engine='python')


class eval_frame_and_all_threads(object):
    """ASV benchmark: element-wise boolean AND via pd.eval (default engine, all threads)."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))

    def time_eval_frame_and_all_threads(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')


class eval_frame_and_python_one_thread(object):
    """ASV benchmark: python-engine boolean AND with numexpr pinned to one thread."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))
        expr.set_numexpr_threads(1)

    def time_eval_frame_and_python_one_thread(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')


class eval_frame_and_python(object):
    """ASV benchmark: element-wise boolean AND via pd.eval with the pure-python engine."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))

    def time_eval_frame_and_python(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')


class eval_frame_chained_cmp_all_threads(object):
    """ASV benchmark: chained comparison via pd.eval (default engine, all threads)."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))

    def time_eval_frame_chained_cmp_all_threads(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('df < df2 < df3 < df4')


class eval_frame_chained_cmp_python_one_thread(object):
goal_time = 0.2
params = [['numexpr', 'python'], [1, 'all']]
param_names = ['engine', 'threads']

def setup(self):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
self.df3 = DataFrame(np.random.randn(20000, 100))
self.df4 = DataFrame(np.random.randn(20000, 100))
expr.set_numexpr_threads(1)

def time_eval_frame_chained_cmp_python_one_thread(self):
pd.eval('df < df2 < df3 < df4', engine='python')


class eval_frame_chained_cmp_python(object):
goal_time = 0.2

def setup(self):
def setup(self, engine, threads):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
self.df3 = DataFrame(np.random.randn(20000, 100))
self.df4 = DataFrame(np.random.randn(20000, 100))

def time_eval_frame_chained_cmp_python(self):
pd.eval('df < df2 < df3 < df4', engine='python')
if threads == 1:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, maybe this should be set explicitly each time something is run to 1 (then you can set it higher in this particular one)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what would be a good value for threads=='all', could you explain how you'd like it. Note each benchmark is run in its own process, so those with threads!=1 get the default value on pandas startup.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, each is in own thread, then ok. I just didn't want to set a 'global', but if spinning off proc then its fine.

expr.set_numexpr_threads(1)

def time_add(self, engine, threads):
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
pd.eval('df + df2 + df3 + df4', engine=engine)

class eval_frame_mult_all_threads(object):
goal_time = 0.2
def time_and(self, engine, threads):
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine)

def setup(self):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
self.df3 = DataFrame(np.random.randn(20000, 100))
self.df4 = DataFrame(np.random.randn(20000, 100))

def time_eval_frame_mult_all_threads(self):
pd.eval('df * df2 * df3 * df4')


class eval_frame_mult_one_thread(object):
    """ASV benchmark: frame-wise multiplication via pd.eval with numexpr pinned to one thread."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))
        expr.set_numexpr_threads(1)

    def time_eval_frame_mult_one_thread(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('df * df2 * df3 * df4')


class eval_frame_mult_python(object):
    """ASV benchmark: frame-wise multiplication via pd.eval with the pure-python engine."""
    goal_time = 0.2

    def setup(self):
        # Four equally-shaped frames so the element-wise expression is well defined.
        self.df = DataFrame(np.random.randn(20000, 100))
        self.df2 = DataFrame(np.random.randn(20000, 100))
        self.df3 = DataFrame(np.random.randn(20000, 100))
        self.df4 = DataFrame(np.random.randn(20000, 100))

    def time_eval_frame_mult_python(self):
        # pd.eval resolves bare names in the *calling frame*; bind the frames
        # to locals first (the original raised UndefinedVariableError).
        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
        pd.eval('df * df2 * df3 * df4', engine='python')


class eval_frame_mult_python_one_thread(object):
goal_time = 0.2

def setup(self):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
self.df3 = DataFrame(np.random.randn(20000, 100))
self.df4 = DataFrame(np.random.randn(20000, 100))
expr.set_numexpr_threads(1)
def time_chained_cmp(self, engine, threads):
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
pd.eval('df < df2 < df3 < df4', engine=engine)

def time_eval_frame_mult_python_one_thread(self):
pd.eval('df * df2 * df3 * df4', engine='python')
def time_mult(self, engine, threads):
df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
pd.eval('df * df2 * df3 * df4', engine=engine)


class query_datetime_index(object):
Expand All @@ -203,6 +47,7 @@ def setup(self):
self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index)

def time_query_datetime_index(self):
ts = self.ts
self.df.query('index < @ts')


Expand All @@ -218,6 +63,7 @@ def setup(self):
self.df = DataFrame({'dates': self.s.values, })

def time_query_datetime_series(self):
ts = self.ts
self.df.query('dates < @ts')


Expand All @@ -236,4 +82,5 @@ def setup(self):
self.max_val = self.df['a'].max()

def time_query_with_boolean_selection(self):
self.df.query('(a >= @min_val) & (a <= @max_val)')
min_val, max_val = self.min_val, self.max_val
self.df.query('(a >= @min_val) & (a <= @max_val)')
8 changes: 4 additions & 4 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def setup(self):
self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
self.value2 = np.random.randn(self.n)
self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
self.obj = tm.choice(list('ab'), size=self.n).astype(object)
self.obj = np.random.choice(list('ab'), size=self.n).astype(object)
self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
'key2': np.random.randint(0, 100, size=self.n),
Expand Down Expand Up @@ -651,7 +651,7 @@ class groupby_sum_multiindex(object):

def setup(self):
self.N = 50
self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B'])
self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])

def time_groupby_sum_multiindex(self):
self.df.groupby(level=[0, 1]).sum()
Expand All @@ -673,9 +673,9 @@ def setup(self):
self.secid_min = int('10000000', 16)
self.secid_max = int('F0000000', 16)
self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step)))
self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids],
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)],
labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)],
names=['date', 'security_id'])
self.n_data = len(self.data_index)
self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))])
Expand Down
8 changes: 6 additions & 2 deletions asv_bench/benchmarks/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,9 @@ def remove(self, f):
class packers_read_sas7bdat(object):

def setup(self):
self.f = 'data/test1.sas7bdat'
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
'pandas', 'io', 'tests', 'sas', 'data',
'test1.sas7bdat')

def time_packers_read_sas7bdat(self):
pd.read_sas(self.f, format='sas7bdat')
Expand All @@ -330,7 +332,9 @@ def time_packers_read_sas7bdat(self):
class packers_read_xport(object):

def setup(self):
self.f = 'data/paxraw_d_short.xpt'
self.f = os.path.join(os.path.dirname(__file__), '..', '..',
'pandas', 'io', 'tests', 'sas', 'data',
'paxraw_d_short.xpt')

def time_packers_read_xport(self):
pd.read_sas(self.f, format='xport')
Expand Down
Loading