From e083c01ce22fd4adb91f610e25f9e4d3f1b238b0 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Wed, 9 Dec 2015 21:24:51 +0200 Subject: [PATCH 1/4] PERF: update asv.conf.json to work with both conda and virtualenv Add requirements rules for conda and virtualenv (requires asv git version). Enable asv wheel cache. --- asv_bench/asv.conf.json | 69 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 6a739873a032f..7b9fe353df2e3 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -30,24 +30,62 @@ // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty - // list indicates to just test against the default (latest) - // version. + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). "matrix": { - // To run against multiple versions, replace with - // "numpy": ["1.7", "1.9"], "numpy": [], "Cython": [], "matplotlib": [], "sqlalchemy": [], "scipy": [], "numexpr": [], - "pytables": [], + "pytables": [null, ""], // platform dependent, see excludes below + "tables": [null, ""], + "libpython": [null, ""], "openpyxl": [], "xlsxwriter": [], "xlrd": [], "xlwt": [] }, + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. 
+ // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + "exclude": [ + // On conda install pytables, otherwise tables + {"environment_type": "conda", "tables": ""}, + {"environment_type": "conda", "pytables": null}, + {"environment_type": "virtualenv", "tables": null}, + {"environment_type": "virtualenv", "pytables": ""}, + // On conda&win32, install libpython + {"sys_platform": "(?!win32).*", "libpython": ""}, + {"sys_platform": "win32", "libpython": null}, + {"environment_type": "(?!conda).*", "libpython": ""} + ], + "include": [], + // The directory (relative to the current directory) that benchmarks are // stored in. If not provided, defaults to "benchmarks" // "benchmark_dir": "benchmarks", @@ -56,7 +94,6 @@ // environments in. If not provided, defaults to "env" // "env_dir": "env", - // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". // "results_dir": "results", @@ -66,5 +103,23 @@ // "html_dir": "html", // The number of characters to retain in the commit hashes. - // "hash_length": 8 + // "hash_length": 8, + + // `asv` will cache wheels of the recent builds in each + // environment, making them faster to install next time. This is + // number of builds to keep, per environment. + "wheel_cache_size": 8, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. 
Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // } } From 448b36af422d24a745fd103d9e31af75e8836ef7 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 8 Mar 2016 21:18:47 +0200 Subject: [PATCH 2/4] PERF: fix easily fixed issues in asv benchmarks - eval.eval_frame_*: fix up local variables - groupby: range() -> list(range()) and using np.random.choice - plotting: ensure matplotlib Agg backend is used - packers: fix data file paths --- asv_bench/benchmarks/eval.py | 197 ++++--------------------------- asv_bench/benchmarks/groupby.py | 8 +- asv_bench/benchmarks/packers.py | 8 +- asv_bench/benchmarks/plotting.py | 5 + 4 files changed, 37 insertions(+), 181 deletions(-) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 719d92567a7be..d9978e0cc4595 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -3,192 +3,36 @@ import pandas.computation.expressions as expr -class eval_frame_add_all_threads(object): +class eval_frame(object): goal_time = 0.2 - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_add_all_threads(self): - pd.eval('df + df2 + df3 + df4') - - -class eval_frame_add_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 
= DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_add_one_thread(self): - pd.eval('df + df2 + df3 + df4') - - -class eval_frame_add_python(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_add_python(self): - pd.eval('df + df2 + df3 + df4', engine='python') - - -class eval_frame_add_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_add_python_one_thread(self): - pd.eval('df + df2 + df3 + df4', engine='python') - - -class eval_frame_and_all_threads(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_and_all_threads(self): - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') - - -class eval_frame_and_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_and_python_one_thread(self): - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python') - - -class eval_frame_and_python(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 
100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_and_python(self): - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python') - - -class eval_frame_chained_cmp_all_threads(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_chained_cmp_all_threads(self): - pd.eval('df < df2 < df3 < df4') - - -class eval_frame_chained_cmp_python_one_thread(object): - goal_time = 0.2 + params = [['numexpr', 'python'], [1, 'all']] + param_names = ['engine', 'threads'] - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_chained_cmp_python_one_thread(self): - pd.eval('df < df2 < df3 < df4', engine='python') - - -class eval_frame_chained_cmp_python(object): - goal_time = 0.2 - - def setup(self): + def setup(self, engine, threads): self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) self.df3 = DataFrame(np.random.randn(20000, 100)) self.df4 = DataFrame(np.random.randn(20000, 100)) - def time_eval_frame_chained_cmp_python(self): - pd.eval('df < df2 < df3 < df4', engine='python') + if threads == 1: + expr.set_numexpr_threads(1) + def time_add(self, engine, threads): + df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 + pd.eval('df + df2 + df3 + df4', engine=engine) -class eval_frame_mult_all_threads(object): - goal_time = 0.2 + def time_and(self, engine, threads): + df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 + pd.eval('(df > 
0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine) - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_mult_all_threads(self): - pd.eval('df * df2 * df3 * df4') - - -class eval_frame_mult_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) - - def time_eval_frame_mult_one_thread(self): - pd.eval('df * df2 * df3 * df4') - - -class eval_frame_mult_python(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - - def time_eval_frame_mult_python(self): - pd.eval('df * df2 * df3 * df4', engine='python') - - -class eval_frame_mult_python_one_thread(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) - expr.set_numexpr_threads(1) + def time_chained_cmp(self, engine, threads): + df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 + pd.eval('df < df2 < df3 < df4', engine=engine) - def time_eval_frame_mult_python_one_thread(self): - pd.eval('df * df2 * df3 * df4', engine='python') + def time_mult(self, engine, threads): + df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 + pd.eval('df * df2 * df3 * df4', engine=engine) class query_datetime_index(object): @@ -203,6 +47,7 @@ def setup(self): self.df = DataFrame({'a': 
np.random.randn(self.N), }, index=self.index) def time_query_datetime_index(self): + ts = self.ts self.df.query('index < @ts') @@ -218,6 +63,7 @@ def setup(self): self.df = DataFrame({'dates': self.s.values, }) def time_query_datetime_series(self): + ts = self.ts self.df.query('dates < @ts') @@ -236,4 +82,5 @@ def setup(self): self.max_val = self.df['a'].max() def time_query_with_boolean_selection(self): - self.df.query('(a >= @min_val) & (a <= @max_val)') \ No newline at end of file + min_val, max_val = self.min_val, self.max_val + self.df.query('(a >= @min_val) & (a <= @max_val)') diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 48480041ed1bd..7279d73eb0d97 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -254,7 +254,7 @@ def setup(self): self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat') self.value2 = np.random.randn(self.n) self.value2[(np.random.rand(self.n) > 0.5)] = np.nan - self.obj = tm.choice(list('ab'), size=self.n).astype(object) + self.obj = np.random.choice(list('ab'), size=self.n).astype(object) self.obj[(np.random.randn(self.n) > 0.5)] = np.nan self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), @@ -651,7 +651,7 @@ class groupby_sum_multiindex(object): def setup(self): self.N = 50 - self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B']) + self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B']) def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() @@ -673,9 +673,9 @@ def setup(self): self.secid_min = int('10000000', 16) self.secid_max = int('F0000000', 16) self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1)) - self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step)) + self.security_ids = 
map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step))) self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids], - labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)], + labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)], names=['date', 'security_id']) self.n_data = len(self.data_index) self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))]) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 74bec72eb05e9..3f80c4c0c6338 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -321,7 +321,9 @@ def remove(self, f): class packers_read_sas7bdat(object): def setup(self): - self.f = 'data/test1.sas7bdat' + self.f = os.path.join(os.path.dirname(__file__), '..', '..', + 'pandas', 'io', 'tests', 'sas', 'data', + 'test1.sas7bdat') def time_packers_read_sas7bdat(self): pd.read_sas(self.f, format='sas7bdat') @@ -330,7 +332,9 @@ def time_packers_read_sas7bdat(self): class packers_read_xport(object): def setup(self): - self.f = 'data/paxraw_d_short.xpt' + self.f = os.path.join(os.path.dirname(__file__), '..', '..', + 'pandas', 'io', 'tests', 'sas', 'data', + 'paxraw_d_short.xpt') def time_packers_read_xport(self): pd.read_sas(self.f, format='xport') diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index d2ef65ed27ef3..7a4a98e2195c2 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -11,6 +11,8 @@ class plot_timeseries_period(object): goal_time = 0.2 def setup(self): + import matplotlib + matplotlib.use('Agg') self.N = 2000 self.M = 5 self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) @@ -18,10 +20,13 @@ def setup(self): def time_plot_timeseries_period(self): self.df.plot() 
+ class plot_andrews_curves(object): goal_time = 0.6 def setup(self): + import matplotlib + matplotlib.use('Agg') self.N = 500 self.M = 10 data_dict = {x: np.random.randn(self.N) for x in range(self.M)} From 65db647cc74b85457fbb5ec959447e252ccb0a6d Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 8 Mar 2016 22:28:00 +0200 Subject: [PATCH 3/4] CLN: more precise asv_bench ignores in .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 06ff0b90b8345..d987bab6fd5d7 100644 --- a/.gitignore +++ b/.gitignore @@ -83,7 +83,10 @@ scikits # Performance Testing # ####################### -asv_bench/ +asv_bench/env/ +asv_bench/html/ +asv_bench/results/ +asv_bench/pandas/ # Documentation generated files # ################################# From 8cba84daf19a319db2d6c93c11ec71ed499e7b4c Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 24 Apr 2016 17:10:05 +0200 Subject: [PATCH 4/4] DOC: contributing: explain how to tell asv which environment to use in more detail --- doc/source/contributing.rst | 38 +++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 29e7cc96dcc87..e64ff4c155132 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -547,8 +547,8 @@ with an imported pandas to run tests similarly. Running the performance test suite ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Performance matters and it is worth considering whether your code has introduced -performance regressions. *pandas* is in the process of migrating to the -`asv library `__ +performance regressions. *pandas* is in the process of migrating to +`asv benchmarks `__ to enable easy monitoring of the performance of critical *pandas* operations. These benchmarks are all found in the ``pandas/asv_bench`` directory. asv supports both python2 and python3. @@ -559,8 +559,9 @@ supports both python2 and python3. 
so many stylistic issues are likely a result of automated transformation of the code. -To use asv you will need either ``conda`` or ``virtualenv``. For more details -please check the `asv installation webpage <https://asv.readthedocs.io/en/latest/installing.html>`_. +To use all features of asv, you will need either ``conda`` or +``virtualenv``. For more details please check the `asv installation +webpage <https://asv.readthedocs.io/en/latest/installing.html>`_. To install asv:: @@ -571,6 +572,14 @@ the following if you have been developing on ``master``:: asv continuous master +This command uses ``conda`` by default for creating the benchmark +environments. If you want to use virtualenv instead, write:: + + asv continuous -E virtualenv master + +The ``-E virtualenv`` option should be added to all ``asv`` commands +that run benchmarks. The default value is defined in ``asv.conf.json``. + If you are working on another branch, either of the following can be used:: asv continuous master HEAD @@ -595,17 +604,26 @@ using ``.`` as a separator. For example:: will only run a ``groupby_agg_builtins1`` test defined in a ``groupby`` file. -It can also be useful to run tests in your current environment. You can simply do it by:: +You can also run the benchmark suite using the version of ``pandas`` +already installed in your current Python environment. This can be +useful if you do not have virtualenv or conda, or are using the +``setup.py develop`` approach discussed above; for the in-place build +you need to set ``PYTHONPATH``, e.g. +``PYTHONPATH="$PWD/.." asv [remaining arguments]``. +You can run benchmarks using an existing Python +environment by:: - asv dev + asv run -e -E existing -This command is equivalent to:: +or, to use a specific Python interpreter,:: - asv run --quick --show-stderr --python=same + asv run -e -E existing:python3.5 -This will launch every test only once, display stderr from the benchmarks, and use your local ``python`` that comes from your ``$PATH``. 
+This will display stderr from the benchmarks, and use your local +``python`` that comes from your ``$PATH``. -Information on how to write a benchmark can be found in the `asv documentation <https://asv.readthedocs.io/en/latest/writing_benchmarks.html>`_. +Information on how to write a benchmark and how to use asv can be found in the +`asv documentation <https://asv.readthedocs.io/en/latest/writing_benchmarks.html>`_. Running the vbench performance test suite (phasing out) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~