pandas-dev · jorisvandenbossche · Oct 28, 2022 · Sep 2, 2022 · Sep 2, 2022 · Sep 20, 2022
diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py
@@ -0,0 +1,108 @@
+from timeit import repeat as timeit
+
+import numpy as np
+import seaborn as sns
+
+from pandas import DataFrame
+
+setup_common = """from pandas import DataFrame
+from numpy.random import randn
+df = DataFrame(randn(%d, 3), columns=list('abc'))
+%s"""
+
+setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
+
+
+def bench_with(n, times=10, repeat=3, engine="numexpr"):
+    return (
+        np.array(
+            timeit(
+                "df.eval(s, engine=%r)" % engine,
+                setup=setup_common % (n, setup_with),
+                repeat=repeat,
+                number=times,
+            )
+        )
+        / times
+    )
+
+
+setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
+
+
+def bench_subset(n, times=20, repeat=3, engine="numexpr"):
+    return (
+        np.array(
+            timeit(
+                "df.query(s, engine=%r)" % engine,
+                setup=setup_common % (n, setup_subset),
+                repeat=repeat,
+                number=times,
+            )
+        )
+        / times
+    )
+
+
+def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
+    r = np.logspace(mn, mx, num=num).round().astype(int)
+
+    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
+    qu = ev.copy(deep=True)
+
+    ev["size"] = qu["size"] = r
+
+    for engine in engines:
+        for i, n in enumerate(r):
+            if verbose & (i % 10 == 0):
+                print("engine: %r, i == %d" % (engine, i))
+            ev_times = bench_with(n, times=1, repeat=1, engine=engine)
+            ev.loc[i, engine] = np.mean(ev_times)
+            qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
+            qu.loc[i, engine] = np.mean(qu_times)
+
+    return ev, qu
+
+
+def plot_perf(df, engines, title, filename=None):
+    from matplotlib.pyplot import figure
+
+    sns.set()
+    sns.set_palette("Set2")
+
+    fig = figure(figsize=(4, 3), dpi=120)
+    ax = fig.add_subplot(111)
+
+    for engine in engines:
+        ax.loglog(df["size"], df[engine], label=engine, lw=2)
+
+    ax.set_xlabel("Number of Rows")
+    ax.set_ylabel("Time (s)")
+    ax.set_title(title)
+    ax.legend(loc="best")
+    ax.tick_params(top=False, right=False)
+
+    fig.tight_layout()
+
+    if filename is not None:
+        fig.savefig(filename)
+
+
+if __name__ == "__main__":
+    import os
+
+    pandas_dir = os.path.dirname(
+        os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    )
+    static_path = os.path.join(pandas_dir, "doc", "source", "_static")
+
+    join = lambda p: os.path.join(static_path, p)
+
+    fn = join("eval-query-perf-data.h5")
+
+    engines = "python", "numexpr"
+
+    ev, qu = bench(verbose=True)  # only this one
+
+    plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
+    plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png
diff --git a/doc/source/_static/eval-perf.png b/doc/source/_static/eval-perf.png
diff --git a/doc/source/_static/query-perf-small.png b/doc/source/_static/query-perf-small.png
diff --git a/doc/source/_static/query-perf.png b/doc/source/_static/query-perf.png
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
    df["a"] = 1
    df
 
-The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
-whether the query modifies the original frame.
-
-.. ipython:: python
-
-   df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
-   df.query("a > 2")
-   df.query("a > 2", inplace=True)
-   df
-
 Local variables
 ~~~~~~~~~~~~~~~
 
 You must *explicitly reference* any local variable that you want to use in an
-expression by placing the ``@`` character in front of the name. For example,
+expression by placing the ``@`` character in front of the name. This mechanism is
+the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example,
 
 .. ipython:: python
 
@@ -820,17 +811,12 @@ significant performance benefit.  Here is a plot showing the running time of
 :func:`pandas.eval` as function of the size of the frame involved in the
 computation. The two lines are two different engines.
 
+..
+    The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
 
 .. image:: ../_static/eval-perf.png
 
-
-.. note::
-
-   Operations with smallish objects (around 15k-20k rows) are faster using
-   plain Python:
-
-       .. image:: ../_static/eval-perf-small.png
-
+You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows.
 
 This plot was created using a :class:`DataFrame` with 3 columns each containing
 floating point values generated using ``numpy.random.randn()``.

diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name
    renaming your columns to something less ambiguous.
 
 
+The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
+whether the query modifies the original frame.
+
+.. ipython:: python
+
+   df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
+   df.query("a > 2")
+   df.query("a > 2", inplace=True)
+   df
+
+
 :class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query`
 ``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
 large frames.
 
+..
+    The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
+
 .. image:: ../_static/query-perf.png
 
-.. note::
 
-   You will only see the performance benefits of using the ``numexpr`` engine
-   with ``DataFrame.query()`` if your frame has more than approximately 200,000
-   rows.
 
-      .. image:: ../_static/query-perf-small.png
+You will only see the performance benefits of using the ``numexpr`` engine
+with ``DataFrame.query()`` if your frame has more than approximately 100,000
+rows.
+
+
 
 This plot was created using a ``DataFrame`` with 3 columns each containing
 floating point values generated using ``numpy.random.randn()``.