From e083c01ce22fd4adb91f610e25f9e4d3f1b238b0 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Wed, 9 Dec 2015 21:24:51 +0200
Subject: [PATCH 1/4] PERF: update asv.conf.json to work with both conda and
 virtualenv

Add requirements rules for conda and virtualenv (requires asv git
version). Enable asv wheel cache.
---
 asv_bench/asv.conf.json | 69 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 6a739873a032f..7b9fe353df2e3 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -30,24 +30,62 @@
 
     // The matrix of dependencies to test.  Each key is the name of a
     // package (in PyPI) and the values are version numbers.  An empty
-    // list indicates to just test against the default (latest)
-    // version.
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is not to be
+    // installed. If the package to be tested is only available from
+    // PyPI, and the 'environment_type' is conda, then you can prefix
+    // the package name with 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
     "matrix": {
-        // To run against multiple versions, replace with
-        // "numpy": ["1.7", "1.9"],
         "numpy": [],
         "Cython": [],
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],
         "numexpr": [],
-        "pytables": [],
+        "pytables": [null, ""],  // platform dependent, see excludes below
+        "tables": [null, ""],
+        "libpython": [null, ""],
         "openpyxl": [],
         "xlsxwriter": [],
         "xlrd": [],
         "xlwt": []
     },
 
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    "exclude": [
+        // On conda install pytables, otherwise tables
+        {"environment_type": "conda", "tables": ""},
+        {"environment_type": "conda", "pytables": null},
+        {"environment_type": "virtualenv", "tables": null},
+        {"environment_type": "virtualenv", "pytables": ""},
+        // On conda&win32, install libpython
+        {"sys_platform": "(?!win32).*", "libpython": ""},
+        {"sys_platform": "win32", "libpython": null},
+        {"environment_type": "(?!conda).*", "libpython": ""}
+    ],
+    "include": [],
+
     // The directory (relative to the current directory) that benchmarks are
     // stored in.  If not provided, defaults to "benchmarks"
     // "benchmark_dir": "benchmarks",
@@ -56,7 +94,6 @@
     // environments in.  If not provided, defaults to "env"
     // "env_dir": "env",
 
-
     // The directory (relative to the current directory) that raw benchmark
     // results are stored in.  If not provided, defaults to "results".
     // "results_dir": "results",
@@ -66,5 +103,23 @@
     // "html_dir": "html",
 
     // The number of characters to retain in the commit hashes.
-    // "hash_length": 8
+    // "hash_length": 8,
+
+    // `asv` will cache wheels of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // the number of builds to keep, per environment.
+    "wheel_cache_size": 8,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // }
 }

From 448b36af422d24a745fd103d9e31af75e8836ef7 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Tue, 8 Mar 2016 21:18:47 +0200
Subject: [PATCH 2/4] PERF: fix easily fixed issues in asv benchmarks

- eval.eval_frame_*: fix up local variables
- groupby: range() -> list(range()) and using np.random.choice
- plotting: ensure matplotlib Agg backend is used
- packers: fix data file paths
---
 asv_bench/benchmarks/eval.py     | 197 ++++---------------------------
 asv_bench/benchmarks/groupby.py  |   8 +-
 asv_bench/benchmarks/packers.py  |   8 +-
 asv_bench/benchmarks/plotting.py |   5 +
 4 files changed, 37 insertions(+), 181 deletions(-)

diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
index 719d92567a7be..d9978e0cc4595 100644
--- a/asv_bench/benchmarks/eval.py
+++ b/asv_bench/benchmarks/eval.py
@@ -3,192 +3,36 @@
 import pandas.computation.expressions as expr
 
 
-class eval_frame_add_all_threads(object):
+class eval_frame(object):
     goal_time = 0.2
 
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_add_all_threads(self):
-        pd.eval('df + df2 + df3 + df4')
-
-
-class eval_frame_add_one_thread(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-        expr.set_numexpr_threads(1)
-
-    def time_eval_frame_add_one_thread(self):
-        pd.eval('df + df2 + df3 + df4')
-
-
-class eval_frame_add_python(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_add_python(self):
-        pd.eval('df + df2 + df3 + df4', engine='python')
-
-
-class eval_frame_add_python_one_thread(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-        expr.set_numexpr_threads(1)
-
-    def time_eval_frame_add_python_one_thread(self):
-        pd.eval('df + df2 + df3 + df4', engine='python')
-
-
-class eval_frame_and_all_threads(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_and_all_threads(self):
-        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
-
-
-class eval_frame_and_python_one_thread(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-        expr.set_numexpr_threads(1)
-
-    def time_eval_frame_and_python_one_thread(self):
-        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
-
-
-class eval_frame_and_python(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_and_python(self):
-        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')
-
-
-class eval_frame_chained_cmp_all_threads(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_chained_cmp_all_threads(self):
-        pd.eval('df < df2 < df3 < df4')
-
-
-class eval_frame_chained_cmp_python_one_thread(object):
-    goal_time = 0.2
+    params = [['numexpr', 'python'], [1, 'all']]
+    param_names = ['engine', 'threads']
 
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-        expr.set_numexpr_threads(1)
-
-    def time_eval_frame_chained_cmp_python_one_thread(self):
-        pd.eval('df < df2 < df3 < df4', engine='python')
-
-
-class eval_frame_chained_cmp_python(object):
-    goal_time = 0.2
-
-    def setup(self):
+    def setup(self, engine, threads):
         self.df = DataFrame(np.random.randn(20000, 100))
         self.df2 = DataFrame(np.random.randn(20000, 100))
         self.df3 = DataFrame(np.random.randn(20000, 100))
         self.df4 = DataFrame(np.random.randn(20000, 100))
 
-    def time_eval_frame_chained_cmp_python(self):
-        pd.eval('df < df2 < df3 < df4', engine='python')
+        if threads == 1:
+            expr.set_numexpr_threads(1)
 
+    def time_add(self, engine, threads):
+        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
+        pd.eval('df + df2 + df3 + df4', engine=engine)
 
-class eval_frame_mult_all_threads(object):
-    goal_time = 0.2
+    def time_and(self, engine, threads):
+        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
+        pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine)
 
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_mult_all_threads(self):
-        pd.eval('df * df2 * df3 * df4')
-
-
-class eval_frame_mult_one_thread(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-        expr.set_numexpr_threads(1)
-
-    def time_eval_frame_mult_one_thread(self):
-        pd.eval('df * df2 * df3 * df4')
-
-
-class eval_frame_mult_python(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-
-    def time_eval_frame_mult_python(self):
-        pd.eval('df * df2 * df3 * df4', engine='python')
-
-
-class eval_frame_mult_python_one_thread(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(20000, 100))
-        self.df2 = DataFrame(np.random.randn(20000, 100))
-        self.df3 = DataFrame(np.random.randn(20000, 100))
-        self.df4 = DataFrame(np.random.randn(20000, 100))
-        expr.set_numexpr_threads(1)
+    def time_chained_cmp(self, engine, threads):
+        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
+        pd.eval('df < df2 < df3 < df4', engine=engine)
 
-    def time_eval_frame_mult_python_one_thread(self):
-        pd.eval('df * df2 * df3 * df4', engine='python')
+    def time_mult(self, engine, threads):
+        df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4
+        pd.eval('df * df2 * df3 * df4', engine=engine)
 
 
 class query_datetime_index(object):
@@ -203,6 +47,7 @@ def setup(self):
         self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index)
 
     def time_query_datetime_index(self):
+        ts = self.ts
         self.df.query('index < @ts')
 
 
@@ -218,6 +63,7 @@ def setup(self):
         self.df = DataFrame({'dates': self.s.values, })
 
     def time_query_datetime_series(self):
+        ts = self.ts
         self.df.query('dates < @ts')
 
 
@@ -236,4 +82,5 @@ def setup(self):
         self.max_val = self.df['a'].max()
 
     def time_query_with_boolean_selection(self):
-        self.df.query('(a >= @min_val) & (a <= @max_val)')
\ No newline at end of file
+        min_val, max_val = self.min_val, self.max_val
+        self.df.query('(a >= @min_val) & (a <= @max_val)')
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 48480041ed1bd..7279d73eb0d97 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -254,7 +254,7 @@ def setup(self):
         self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat')
         self.value2 = np.random.randn(self.n)
         self.value2[(np.random.rand(self.n) > 0.5)] = np.nan
-        self.obj = tm.choice(list('ab'), size=self.n).astype(object)
+        self.obj = np.random.choice(list('ab'), size=self.n).astype(object)
         self.obj[(np.random.randn(self.n) > 0.5)] = np.nan
         self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n),
                              'key2': np.random.randint(0, 100, size=self.n),
@@ -651,7 +651,7 @@ class groupby_sum_multiindex(object):
 
     def setup(self):
         self.N = 50
-        self.df = DataFrame({'A': (range(self.N) * 2), 'B': range((self.N * 2)), 'C': 1, }).set_index(['A', 'B'])
+        self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B'])
 
     def time_groupby_sum_multiindex(self):
         self.df.groupby(level=[0, 1]).sum()
@@ -673,9 +673,9 @@ def setup(self):
         self.secid_min = int('10000000', 16)
         self.secid_max = int('F0000000', 16)
         self.step = ((self.secid_max - self.secid_min) // (self.n_securities - 1))
-        self.security_ids = map((lambda x: hex(x)[2:10].upper()), range(self.secid_min, (self.secid_max + 1), self.step))
+        self.security_ids = map((lambda x: hex(x)[2:10].upper()), list(range(self.secid_min, (self.secid_max + 1), self.step)))
         self.data_index = MultiIndex(levels=[self.dates.values, self.security_ids],
-                                     labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (range(self.n_securities) * self.n_dates)],
+                                     labels=[[i for i in range(self.n_dates) for _ in range(self.n_securities)], (list(range(self.n_securities)) * self.n_dates)],
                                      names=['date', 'security_id'])
         self.n_data = len(self.data_index)
         self.columns = Index(['factor{}'.format(i) for i in range(1, (self.n_columns + 1))])
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
index 74bec72eb05e9..3f80c4c0c6338 100644
--- a/asv_bench/benchmarks/packers.py
+++ b/asv_bench/benchmarks/packers.py
@@ -321,7 +321,9 @@ def remove(self, f):
 class packers_read_sas7bdat(object):
 
     def setup(self):
-        self.f = 'data/test1.sas7bdat'
+        self.f = os.path.join(os.path.dirname(__file__), '..', '..',
+                              'pandas', 'io', 'tests', 'sas', 'data',
+                              'test1.sas7bdat')
 
     def time_packers_read_sas7bdat(self):
         pd.read_sas(self.f, format='sas7bdat')
@@ -330,7 +332,9 @@ def time_packers_read_sas7bdat(self):
 class packers_read_xport(object):
 
     def setup(self):
-        self.f = 'data/paxraw_d_short.xpt'
+        self.f = os.path.join(os.path.dirname(__file__), '..', '..',
+                              'pandas', 'io', 'tests', 'sas', 'data',
+                              'paxraw_d_short.xpt')
 
     def time_packers_read_xport(self):
         pd.read_sas(self.f, format='xport')
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
index d2ef65ed27ef3..7a4a98e2195c2 100644
--- a/asv_bench/benchmarks/plotting.py
+++ b/asv_bench/benchmarks/plotting.py
@@ -11,6 +11,8 @@ class plot_timeseries_period(object):
     goal_time = 0.2
 
     def setup(self):
+        import matplotlib
+        matplotlib.use('Agg')
         self.N = 2000
         self.M = 5
         self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N))
@@ -18,10 +20,13 @@ def setup(self):
     def time_plot_timeseries_period(self):
         self.df.plot()
 
+
 class plot_andrews_curves(object):
     goal_time = 0.6
 
     def setup(self):
+        import matplotlib
+        matplotlib.use('Agg')
         self.N = 500
         self.M = 10
         data_dict = {x: np.random.randn(self.N) for x in range(self.M)}

From 65db647cc74b85457fbb5ec959447e252ccb0a6d Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Tue, 8 Mar 2016 22:28:00 +0200
Subject: [PATCH 3/4] CLN: more precise asv_bench ignores in .gitignore

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 06ff0b90b8345..d987bab6fd5d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,7 +83,10 @@ scikits
 
 # Performance Testing #
 #######################
-asv_bench/
+asv_bench/env/
+asv_bench/html/
+asv_bench/results/
+asv_bench/pandas/
 
 # Documentation generated files #
 #################################

From 8cba84daf19a319db2d6c93c11ec71ed499e7b4c Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Sun, 24 Apr 2016 17:10:05 +0200
Subject: [PATCH 4/4] DOC: contributing: explain how to tell asv which
 environment to use in more detail

---
 doc/source/contributing.rst | 38 +++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index 29e7cc96dcc87..e64ff4c155132 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -547,8 +547,8 @@ with an imported pandas to run tests similarly.
 Running the performance test suite
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Performance matters and it is worth considering whether your code has introduced
-performance regressions.  *pandas* is in the process of migrating to the
-`asv library <https://github.com/spacetelescope/asv>`__
+performance regressions.  *pandas* is in the process of migrating to
+`asv benchmarks <https://github.com/spacetelescope/asv>`__
 to enable easy monitoring of the performance of critical *pandas* operations.
 These benchmarks are all found in the ``pandas/asv_bench`` directory.  asv
 supports both python2 and python3.
@@ -559,8 +559,9 @@ supports both python2 and python3.
     so many stylistic issues are likely a result of automated transformation of the
     code.
 
-To use asv you will need either ``conda`` or ``virtualenv``. For more details
-please check the `asv installation webpage <http://asv.readthedocs.org/en/latest/installing.html>`_.
+To use all features of asv, you will need either ``conda`` or
+``virtualenv``. For more details please check the `asv installation
+webpage <http://asv.readthedocs.org/en/latest/installing.html>`_.
 
 To install asv::
 
@@ -571,6 +572,14 @@ the following if you have been developing on ``master``::
 
     asv continuous master
 
+This command uses ``conda`` by default for creating the benchmark
+environments. If you want to use virtualenv instead, write::
+
+    asv continuous -E virtualenv master
+
+The ``-E virtualenv`` option should be added to all ``asv`` commands
+that run benchmarks. The default value is defined in ``asv.conf.json``.
+
 If you are working on another branch, either of the following can be used::
 
     asv continuous master HEAD
@@ -595,17 +604,26 @@ using ``.`` as a separator. For example::
 
 will only run a ``groupby_agg_builtins1`` test defined in a ``groupby`` file.
 
-It can also be useful to run tests in your current environment. You can simply do it by::
+You can also run the benchmark suite using the version of ``pandas``
+already installed in your current Python environment. This can be
+useful if you do not have virtualenv or conda, or are using the
+``setup.py develop`` approach discussed above; for the in-place build
+you need to set ``PYTHONPATH``, e.g.
+``PYTHONPATH="$PWD/.." asv [remaining arguments]``.
+You can run benchmarks using an existing Python
+environment by::
 
-    asv dev
+    asv run -e -E existing
 
-This command is equivalent to::
+or, to use a specific Python interpreter::
 
-    asv run --quick --show-stderr --python=same
+    asv run -e -E existing:python3.5
 
-This will launch every test only once, display stderr from the benchmarks, and use your local ``python`` that comes from your ``$PATH``.
+This will display stderr from the benchmarks, and use your local
+``python`` that comes from your ``$PATH``.
 
-Information on how to write a benchmark can be found in the `asv documentation <http://asv.readthedocs.org/en/latest/writing_benchmarks.html>`_.
+Information on how to write a benchmark and how to use asv can be found in the
+`asv documentation <http://asv.readthedocs.org/en/latest/writing_benchmarks.html>`_.
 
 Running the vbench performance test suite (phasing out)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~