pandas-dev
diff --git a/‎bench/bench_with_subset.R
Lines changed: 53 additions & 0 deletions b/‎bench/bench_with_subset.R
Lines changed: 53 additions & 0 deletions
diff --git a/‎bench/bench_with_subset.py
Lines changed: 37 additions & 0 deletions b/‎bench/bench_with_subset.py
Lines changed: 37 additions & 0 deletions
diff --git a/‎doc/source/api.rst
Lines changed: 1 addition & 0 deletions b/‎doc/source/api.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/source/comparison_with_r.rst
Lines changed: 81 additions & 15 deletions b/‎doc/source/comparison_with_r.rst
Lines changed: 81 additions & 15 deletions
@@ -0,0 +1,53 @@
+library(microbenchmark)
+library(data.table)
+
+
+data.frame.subset.bench <- function (n=1e7, times=30) {
+    # Benchmark base R subset() on a data.frame and print the timings.
+    #
+    # n:     length of each of the three rnorm() columns (a, b, c)
+    # times: forwarded to microbenchmark's `times` argument
+    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+
+    # The frame is built once, outside the timed expression, so only the
+    # subset() call itself is measured.
+    timings <- microbenchmark(
+        subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
+        times=times
+    )
+    print(timings)
+}
+
+
+# data.table allows something very similar to query: rows can be filtered
+# with an unquoted expression over the column names. There are no chained
+# comparisons, so the condition is spelled out with `&`.
+data.table.subset.expression.bench <- function (n=1e7, times=30) {
+    # Benchmark row filtering through data.table's `[` and print the timings.
+    #
+    # n:     length of each of the three rnorm() columns (a, b, c)
+    # times: forwarded to microbenchmark's `times` argument
+    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+
+    # The condition goes in the first (`i`, row) position so that rows are
+    # actually subset. Placed after a leading comma (the `j` position) it
+    # would only compute the logical vector, which is not comparable with
+    # subset() or DataFrame.query.
+    print(microbenchmark(dt[a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c],
+                         times=times))
+}
+
+
+# for good measure, also time subset() applied to a data.table
+data.table.subset.bench <- function (n=1e7, times=30) {
+    # n:     length of each of the three rnorm() columns (a, b, c)
+    # times: forwarded to microbenchmark's `times` argument
+    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+
+    # Same row condition as the data.frame subset benchmark.
+    timings <- microbenchmark(
+        subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c),
+        times=times
+    )
+    print(timings)
+}
+
+
+data.frame.with.bench <- function (n=1e7, times=30) {
+    # Benchmark with() evaluating an arithmetic expression over the columns
+    # of a data.frame, and print the timings.
+    #
+    # n:     length of each of the three rnorm() columns (a, b, c)
+    # times: forwarded to microbenchmark's `times` argument
+    df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+    timings <- microbenchmark(
+        with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
+        times=times
+    )
+    print(timings)
+}
+
+
+data.table.with.bench <- function (n=1e7, times=30) {
+    # Benchmark with() evaluating an arithmetic expression over the columns
+    # of a data.table, and print the timings.
+    #
+    # n:     length of each of the three rnorm() columns (a, b, c)
+    # times: forwarded to microbenchmark's `times` argument
+    dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n))
+    timings <- microbenchmark(
+        with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3),
+        times=times
+    )
+    print(timings)
+}
+
+
+bench <- function () {
+    # Run every benchmark defined in this file with its default arguments:
+    # the subset benchmarks first, then the with() benchmarks. Each one
+    # prints its own timing summary.
+    leading <- list(data.frame.subset.bench,
+                    data.table.subset.expression.bench,
+                    data.table.subset.bench,
+                    data.frame.with.bench)
+    for (run in leading) {
+        run()
+    }
+
+    # Called directly as the final expression so that bench() returns
+    # whatever this last benchmark returns.
+    data.table.with.bench()
+}
+
+
+# Run the full suite when the script is sourced/executed.
+bench()
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+"""
+Microbenchmarks for comparison with R's "with" and "subset" functions
+"""
+
+from __future__ import print_function
+from timeit import timeit
+
+
+def bench_with(n=1e7, times=10, repeat=3):
+    setup = "from pandas import DataFrame\n"
+    setup += "from numpy.random import randn\n"
+    setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n
+    setup += "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
+    print('DataFrame.eval:')
+    print(timeit('df.eval(s)', setup=setup, repeat=repeat, number=times))
+
+
+def bench_subset(n=1e7, times=10, repeat=3):
+    setup = "from pandas import DataFrame\n"
+    setup += "from numpy.random import randn\n"
+    setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n
+    setup += "s = 'a <= b <= (c ** 2 + b ** 2 - a) and b > c'"
+    print('DataFrame.query:')
+    print(timeit('df.query(s)', setup=setup, repeat=repeat, number=times))
+    print('DataFrame.__getitem__:')
+    print(timeit('df[s]', setup=setup, repeat=repeat, number=times))
+
+
+def bench():
+    bench_with()
+    bench_subset()
+
+
+if __name__ == '__main__':
+    bench()
@@ -514,6 +514,7 @@ Computations / Descriptive Stats
    DataFrame.cumsum
    DataFrame.describe
    DataFrame.diff
+   DataFrame.eval
    DataFrame.kurt
    DataFrame.mad
    DataFrame.max
 
@@ -1,28 +1,88 @@
 .. currentmodule:: pandas
 .. _compare_with_r:
 
-*******************************
 Comparison with R / R libraries
 *******************************
 
-Since pandas aims to provide a lot of the data manipulation and analysis
-functionality that people use R for, this page was started to provide a more
-detailed look at the R language and it's many 3rd party libraries as they
-relate to pandas. In offering comparisons with R and CRAN libraries, we care
-about the following things:
+Since ``pandas`` aims to provide a lot of the data manipulation and analysis
+functionality that people use `R <http://www.r-project.org/>`__ for, this page
+was started to provide a more detailed look at the `R language
+<http://en.wikipedia.org/wiki/R_(programming_language)>`__ and its many third
+party libraries as they relate to ``pandas``. In comparisons with R and CRAN
+libraries, we care about the following things:
 
-  - **Functionality / flexibility**: what can / cannot be done with each tool
-  - **Performance**: how fast are operations. Hard numbers / benchmarks are
+  - **Functionality / flexibility**: what can/cannot be done with each tool
+  - **Performance**: how fast are operations. Hard numbers/benchmarks are
     preferable
-  - **Ease-of-use**: is one tool easier or harder to use (you may have to be
-    the judge of this given side-by-side code comparisons)
+  - **Ease-of-use**: is one tool easier/harder to use (you may have to be
+    the judge of this, given side-by-side code comparisons)
+
+This page is also here to offer a bit of a translation guide for users of these
+R packages.
+
+Base R
+------
+
+|subset|_
+~~~~~~~~~~
+
+.. versionadded:: 0.13
+
+The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset``
+function. In R you might want to get the rows of a ``data.frame`` where one
+column's values are less than another column's values:
+
+    .. code-block:: r
+
+       df <- data.frame(a=rnorm(10), b=rnorm(10))
+       subset(df, a <= b)
+       df[df$a <= df$b,]  # note the comma
+
+In ``pandas``, there are a few ways to perform subsetting. You can use
+:meth:`~pandas.DataFrame.query`, pass an expression string as if it were an
+index/slice, or use standard boolean indexing:
+
+    .. ipython:: python
+
+       from pandas import DataFrame
+       from numpy.random import randn
+
+       df = DataFrame({'a': randn(10), 'b': randn(10)})
+       df.query('a <= b')
+       df['a <= b']
+       df[df.a <= df.b]
+       df.loc[df.a <= df.b]
 
-As I do not have an encyclopedic knowledge of R packages, feel free to suggest
-additional CRAN packages to add to this list. This is also here to offer a big
-of a translation guide for users of these R packages.
+For more details and examples see :ref:`the query documentation
+<indexing.query>`.
 
-data.frame
-----------
+
+|with|_
+~~~~~~~~
+
+.. versionadded:: 0.13
+
+An expression using a ``data.frame`` called ``df`` in R with the columns ``a``
+and ``b`` would be evaluated using ``with`` like so:
+
+    .. code-block:: r
+
+       df <- data.frame(a=rnorm(10), b=rnorm(10))
+       with(df, a + b)
+       df$a + df$b  # same as the previous expression
+
+In ``pandas`` the equivalent expression, using the
+:meth:`~pandas.DataFrame.eval` method, would be:
+
+    .. ipython:: python
+
+       df = DataFrame({'a': randn(10), 'b': randn(10)})
+       df.eval('a + b')
+       df.a + df.b  # same as the previous expression
+
+In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than
+evaluation in pure Python. For more details and examples see :ref:`the eval
+documentation <enhancingperf.eval>`.
 
 zoo
 ---
@@ -36,3 +96,9 @@ plyr
 reshape / reshape2
 ------------------
 
+
+.. |with| replace:: ``with``
+.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html
+
+.. |subset| replace:: ``subset``
+.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html