pandas-dev · jreback · Mar 9, 2013 · Feb 25, 2013 · Mar 8, 2013 · Mar 9, 2013
diff --git a/README.rst b/README.rst
@@ -70,6 +70,12 @@ Dependencies
   * `pytz <http://pytz.sourceforge.net/>`__
      * Needed for time zone support with ``date_range``
 
+Highly Recommended Dependencies
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  * `numexpr <http://code.google.com/p/numexpr/>`__: to accelerate some expression evaluation operations;
+    also required by `PyTables`
+  * `bottleneck <http://berkeleyanalytics.com/>`__: to accelerate certain numerical operations
+
 Optional dependencies
 ~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
@@ -224,6 +224,12 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
+  - Numexpr is now a 'highly recommended dependency', to accelerate certain
+    types of expression evaluation
+
+  - Bottleneck is now a 'highly recommended dependency', to accelerate certain
+    types of numerical evaluations
+
   - In ``HDFStore``, provide dotted attribute access to ``get`` from stores
     (e.g. ``store.df == store['df']``)
 

diff --git a/pandas/core/expressions.py b/pandas/core/expressions.py
@@ -0,0 +1,129 @@
+"""
+Expressions
+-----------
+
+Offer fast expression evaluation through numexpr
+
+"""
+import numpy as np
+
+try:
+    import numexpr as ne
+    _NUMEXPR_INSTALLED = True
+except ImportError:  # pragma: no cover
+    _NUMEXPR_INSTALLED = False
+
+_USE_NUMEXPR = _NUMEXPR_INSTALLED  # runtime switch, changed via set_use_numexpr
+_evaluate    = None                # bound by set_use_numexpr (called at import)
+
+# the set of dtype names that we allow to be passed to numexpr
+_ALLOWED_DTYPES = set(['int64','int32','float64','float32','bool'])
+
+# numexpr is only used when prod(shape) of the left operand exceeds this
+_MIN_ELEMENTS   = 10000
+
+def set_use_numexpr(v = True):
+    """ turn numexpr usage on or off and rebind the module-level evaluator
+
+        the flag is left untouched when numexpr is not installed
+        """
+    global _USE_NUMEXPR, _evaluate
+    # only honour the request when numexpr could actually be imported
+    if _NUMEXPR_INSTALLED:
+        _USE_NUMEXPR = v
+
+    # pick the implementation that evaluate() dispatches to
+    if _USE_NUMEXPR:
+        _evaluate = _evaluate_numexpr
+    else:
+        _evaluate = _evaluate_standard
+
+def set_numexpr_threads(n = None):
+    """ best-effort: set the numexpr thread count to n (None -> detected cores) """
+    # only acts when numexpr is installed and currently enabled
+    try:
+        if _NUMEXPR_INSTALLED and _USE_NUMEXPR:
+            if n is None:
+                n = ne.detect_number_of_cores()
+            ne.set_num_threads(n)
+    except Exception:  # still best-effort, but no longer traps KeyboardInterrupt
+        pass
+
+
+def _evaluate_standard(op, op_str, a, b, raise_on_error=True):
+    """ standard evaluation: apply op to a and b (op_str, raise_on_error unused) """
+    return op(a,b)
+
+def _can_use_numexpr(op, op_str, a, b):
+    """ decide whether this operation is eligible for numexpr evaluation """
+    # without a string form of the operator there is nothing to hand over
+    if op_str is None:
+        return False
+
+    # required min elements (otherwise we are adding overhead)
+    if not np.prod(a.shape) > _MIN_ELEMENTS:
+        return False
+
+    # gather the dtype names carried by both operands
+    found = set()
+    for operand in (a, b):
+        if hasattr(operand, 'get_dtype_counts'):
+            counts = operand.get_dtype_counts()
+            if len(counts) > 1:
+                return False
+            found |= set(counts.index)
+        elif isinstance(operand, np.ndarray):
+            found |= set([operand.dtype.name])
+
+    # usable when nothing was gathered or every dtype is in the allowed set
+    return not len(found) or _ALLOWED_DTYPES >= found
+
+def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False):
+    """ evaluate via numexpr when eligible, else fall back to the standard op """
+    result = None
+    if _can_use_numexpr(op, op_str, a, b):
+        try:
+            # hand numexpr the underlying .values when an operand exposes them
+            a_value, b_value = a, b
+            if hasattr(a_value,'values'):
+                a_value = a_value.values
+            if hasattr(b_value,'values'):
+                b_value = b_value.values
+            result = ne.evaluate('a_value %s b_value' % op_str,
+                                 local_dict={ 'a_value' : a_value,
+                                              'b_value' : b_value },
+                                 casting='safe')
+        except ValueError:
+            pass  # e.g. 'unknown type object': result stays None -> fallback
+        except Exception as detail:
+            if raise_on_error:
+                raise TypeError(str(detail))
+
+    if result is None:
+        result = _evaluate_standard(op,op_str,a,b,raise_on_error)
+
+    return result
+
+# select the evaluator at import time (numexpr if installed, else standard)
+set_use_numexpr(True)
+
+def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True):
+    """ evaluate and return the result of applying op to a and b
+
+        Parameters
+        ----------
+
+        op :    the actual operator (a callable taking a and b)
+        op_str: the string version of the op
+        a :     left operand
+        b :     right operand
+        raise_on_error : forwarded to the selected evaluator (default is False);
+                         the numexpr evaluator then re-raises non-ValueError
+                         failures as TypeError instead of falling back
+        use_numexpr : whether to try to use numexpr (default True)
+        """
+
+    impl = _evaluate if use_numexpr else _evaluate_standard
+    return impl(op, op_str, a, b, raise_on_error=raise_on_error)
+
+
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -31,6 +31,7 @@
                                   _is_index_slice, _check_bool_indexer)
 from pandas.core.internals import BlockManager, make_block, form_blocks
 from pandas.core.series import Series, _radd_compat
+import pandas.core.expressions as expressions
 from pandas.compat.scipy import scoreatpercentile as _quantile
 from pandas.util.compat import OrderedDict
 from pandas.util import py3compat
@@ -53,7 +54,6 @@
 
 from pandas.core.config import get_option
 
-
 #----------------------------------------------------------------------
 # Docstring templates
 
@@ -186,10 +186,10 @@ class DataConflictError(Exception):
 # Factory helper methods
 
 
-def _arith_method(op, name, default_axis='columns'):
+def _arith_method(op, name, str_rep = None, default_axis='columns'):
     def na_op(x, y):
         try:
-            result = op(x, y)
+            result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True)
         except TypeError:
             xrav = x.ravel()
             result = np.empty(x.size, dtype=x.dtype)
@@ -240,7 +240,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
     return f
 
 
-def _flex_comp_method(op, name, default_axis='columns'):
+def _flex_comp_method(op, name, str_rep = None, default_axis='columns'):
 
     def na_op(x, y):
         try:
@@ -268,7 +268,7 @@ def na_op(x, y):
     @Appender('Wrapper for flexible comparison methods %s' % name)
     def f(self, other, axis=default_axis, level=None):
         if isinstance(other, DataFrame):    # Another DataFrame
-            return self._flex_compare_frame(other, na_op, level)
+            return self._flex_compare_frame(other, na_op, str_rep, level)
 
         elif isinstance(other, Series):
             return self._combine_series(other, na_op, None, axis, level)
@@ -294,7 +294,7 @@ def f(self, other, axis=default_axis, level=None):
                 casted = DataFrame(other, index=self.index,
                                    columns=self.columns)
 
-                return self._flex_compare_frame(casted, na_op, level)
+                return self._flex_compare_frame(casted, na_op, str_rep, level)
 
             else:  # pragma: no cover
                 raise ValueError("Bad argument shape")
@@ -307,11 +307,11 @@ def f(self, other, axis=default_axis, level=None):
     return f
 
 
-def _comp_method(func, name):
+def _comp_method(func, name, str_rep):
     @Appender('Wrapper for comparison method %s' % name)
     def f(self, other):
         if isinstance(other, DataFrame):    # Another DataFrame
-            return self._compare_frame(other, func)
+            return self._compare_frame(other, func, str_rep)
         elif isinstance(other, Series):
             return self._combine_series_infer(other, func)
         else:
@@ -750,26 +750,26 @@ def __contains__(self, key):
     #----------------------------------------------------------------------
     # Arithmetic methods
 
-    add = _arith_method(operator.add, 'add')
-    mul = _arith_method(operator.mul, 'multiply')
-    sub = _arith_method(operator.sub, 'subtract')
-    div = divide = _arith_method(lambda x, y: x / y, 'divide')
-    pow = _arith_method(operator.pow, 'pow')
+    add = _arith_method(operator.add, 'add', '+')
+    mul = _arith_method(operator.mul, 'multiply', '*')
+    sub = _arith_method(operator.sub, 'subtract', '-')
+    div = divide = _arith_method(lambda x, y: x / y, 'divide', '/')
+    pow = _arith_method(operator.pow, 'pow', '**')
 
     radd = _arith_method(_radd_compat, 'radd')
     rmul = _arith_method(operator.mul, 'rmultiply')
     rsub = _arith_method(lambda x, y: y - x, 'rsubtract')
     rdiv = _arith_method(lambda x, y: y / x, 'rdivide')
     rpow = _arith_method(lambda x, y: y ** x, 'rpow')
 
-    __add__ = _arith_method(operator.add, '__add__', default_axis=None)
-    __sub__ = _arith_method(operator.sub, '__sub__', default_axis=None)
-    __mul__ = _arith_method(operator.mul, '__mul__', default_axis=None)
-    __truediv__ = _arith_method(operator.truediv, '__truediv__',
+    __add__ = _arith_method(operator.add, '__add__', '+', default_axis=None)
+    __sub__ = _arith_method(operator.sub, '__sub__', '-', default_axis=None)
+    __mul__ = _arith_method(operator.mul, '__mul__', '*', default_axis=None)
+    __truediv__ = _arith_method(operator.truediv, '__truediv__', '/',
                                 default_axis=None)
     __floordiv__ = _arith_method(operator.floordiv, '__floordiv__',
                                  default_axis=None)
-    __pow__ = _arith_method(operator.pow, '__pow__', default_axis=None)
+    __pow__ = _arith_method(operator.pow, '__pow__', '**', default_axis=None)
 
     __radd__ = _arith_method(_radd_compat, '__radd__', default_axis=None)
     __rmul__ = _arith_method(operator.mul, '__rmul__', default_axis=None)
@@ -782,13 +782,13 @@ def __contains__(self, key):
                              default_axis=None)
 
     # boolean operators
-    __and__ = _arith_method(operator.and_, '__and__')
-    __or__ = _arith_method(operator.or_, '__or__')
+    __and__ = _arith_method(operator.and_, '__and__', '&')
+    __or__ = _arith_method(operator.or_, '__or__', '|')
     __xor__ = _arith_method(operator.xor, '__xor__')
 
     # Python 2 division methods
     if not py3compat.PY3:
-        __div__ = _arith_method(operator.div, '__div__', default_axis=None)
+        __div__ = _arith_method(operator.div, '__div__', '/', default_axis=None)
         __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__',
                                  default_axis=None)
 
@@ -801,19 +801,19 @@ def __invert__(self):
         return self._wrap_array(arr, self.axes, copy=False)
 
     # Comparison methods
-    __eq__ = _comp_method(operator.eq, '__eq__')
-    __ne__ = _comp_method(operator.ne, '__ne__')
-    __lt__ = _comp_method(operator.lt, '__lt__')
-    __gt__ = _comp_method(operator.gt, '__gt__')
-    __le__ = _comp_method(operator.le, '__le__')
-    __ge__ = _comp_method(operator.ge, '__ge__')
-
-    eq = _flex_comp_method(operator.eq, 'eq')
-    ne = _flex_comp_method(operator.ne, 'ne')
-    gt = _flex_comp_method(operator.gt, 'gt')
-    lt = _flex_comp_method(operator.lt, 'lt')
-    ge = _flex_comp_method(operator.ge, 'ge')
-    le = _flex_comp_method(operator.le, 'le')
+    __eq__ = _comp_method(operator.eq, '__eq__', '==')
+    __ne__ = _comp_method(operator.ne, '__ne__', '!=')
+    __lt__ = _comp_method(operator.lt, '__lt__', '<' )
+    __gt__ = _comp_method(operator.gt, '__gt__', '>' )
+    __le__ = _comp_method(operator.le, '__le__', '<=')
+    __ge__ = _comp_method(operator.ge, '__ge__', '>=')
+
+    eq = _flex_comp_method(operator.eq, 'eq', '==')
+    ne = _flex_comp_method(operator.ne, 'ne', '!=')
+    lt = _flex_comp_method(operator.lt, 'lt', '<')
+    gt = _flex_comp_method(operator.gt, 'gt', '>')
+    le = _flex_comp_method(operator.le, 'le', '<=')
+    ge = _flex_comp_method(operator.ge, 'ge', '>=')
 
     def dot(self, other):
         """
@@ -1669,14 +1669,6 @@ def convert_objects(self, convert_dates=True, convert_numeric=False):
         """
         return self._constructor(self._data.convert(convert_dates=convert_dates, convert_numeric=convert_numeric))
 
-    def get_dtype_counts(self):
-        """ return the counts of dtypes in this frame """
-        self._consolidate_inplace()
-        counts = dict()
-        for b in self._data.blocks:
-            counts[b.dtype.name] = counts.get(b.dtype,0) + b.shape[0]
-        return Series(counts)
-
     #----------------------------------------------------------------------
     # properties for index and columns
 
@@ -3710,25 +3702,25 @@ def _combine_const(self, other, func, raise_on_error = True):
         new_data = self._data.eval(func, other, raise_on_error=raise_on_error)
         return self._constructor(new_data)
 
-    def _compare_frame(self, other, func):
+    def _compare_frame(self, other, func, str_rep):
         if not self._indexed_same(other):
             raise Exception('Can only compare identically-labeled '
                             'DataFrame objects')
 
-        new_data = {}
-        for col in self.columns:
-            new_data[col] = func(self[col], other[col])
+        def _compare(a, b):
+            return dict([ (col,func(a[col], b[col])) for col in a.columns ])
+        new_data = expressions.evaluate(_compare, str_rep, self, other)
 
         return self._constructor(data=new_data, index=self.index,
                                  columns=self.columns, copy=False)
 
-    def _flex_compare_frame(self, other, func, level):
+    def _flex_compare_frame(self, other, func, str_rep, level):
         if not self._indexed_same(other):
             self, other = self.align(other, 'outer', level=level)
 
-        new_data = {}
-        for col in self.columns:
-            new_data[col] = func(self[col], other[col])
+        def _compare(a, b):
+            return dict([ (col,func(a[col], b[col])) for col in a.columns ])
+        new_data = expressions.evaluate(_compare, str_rep, self, other)
 
         return self._constructor(data=new_data, index=self.index,
                                  columns=self.columns, copy=False)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -606,6 +606,11 @@ def __delitem__(self, key):
         except KeyError:
             pass
 
+    def get_dtype_counts(self):
+        """ return the counts of dtypes in this object, as a Series """
+        from pandas import Series  # NOTE(review): presumably local to avoid a circular import -- confirm
+        return Series(self._data.get_dtype_counts())
+
     def pop(self, item):
         """
         Return item and drop from frame. Raise KeyError if not found.