Skip to content

ENH: numexpr on boolean frames #2925

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 9, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ Dependencies
* `pytz <http://pytz.sourceforge.net/>`__
* Needed for time zone support with ``date_range``

Highly Recommended Dependencies
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* `numexpr <http://code.google.com/p/numexpr/>`__: to accelerate some expression evaluation operations;
also required by `PyTables`
* `bottleneck <http://berkeleyanalytics.com/>`__: to accelerate certain numerical operations

Optional dependencies
~~~~~~~~~~~~~~~~~~~~~

Expand Down
6 changes: 6 additions & 0 deletions doc/source/v0.11.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,12 @@ API changes
Enhancements
~~~~~~~~~~~~

- Numexpr is now a 'highly recommended dependency', to accelerate certain
types of expression evaluation

- Bottleneck is now a 'highly recommended dependency', to accelerate certain
types of numerical evaluations

- In ``HDFStore``, provide dotted attribute access to ``get`` from stores
(e.g. ``store.df == store['df']``)

Expand Down
129 changes: 129 additions & 0 deletions pandas/core/expressions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
Expressions
-----------

Offer fast expression evaluation through numexpr

"""
import numpy as np

# numexpr is optional: record whether the import succeeded so the rest of
# the module can fall back to plain evaluation when it is missing
try:
    import numexpr as ne
except ImportError:  # pragma: no cover
    _NUMEXPR_INSTALLED = False
else:
    _NUMEXPR_INSTALLED = True

# whether numexpr is currently enabled; starts out on iff it is importable
_USE_NUMEXPR = _NUMEXPR_INSTALLED

# the evaluation function currently in effect; bound by set_use_numexpr
_evaluate = None

# dtype names that may be passed through to numexpr
_ALLOWED_DTYPES = set(['bool', 'float32', 'float64', 'int32', 'int64'])

# operands need more than this many elements (product of the shape)
# before numexpr is used
_MIN_ELEMENTS = 10000

def set_use_numexpr(v=True):
    """ turn numexpr usage on or off and rebind the evaluation function

        The requested value ``v`` is only recorded when numexpr is installed;
        otherwise ``_USE_NUMEXPR`` is left as it was.  ``_evaluate`` is then
        pointed at ``_evaluate_numexpr`` when numexpr is in use and at
        ``_evaluate_standard`` when it is not.
    """
    global _USE_NUMEXPR, _evaluate

    if _NUMEXPR_INSTALLED:
        _USE_NUMEXPR = v

    # pick the implementation that evaluate() dispatches to
    _evaluate = _evaluate_numexpr if _USE_NUMEXPR else _evaluate_standard

def set_numexpr_threads(n=None):
    """ set the number of threads numexpr uses (best effort)

        Does nothing unless numexpr is installed and currently enabled.

        Parameters
        ----------

        n : number of threads; when None, the value returned by
            ``ne.detect_number_of_cores()`` is used
    """
    if not (_NUMEXPR_INSTALLED and _USE_NUMEXPR):
        return

    try:
        if n is None:
            n = ne.detect_number_of_cores()
        ne.set_num_threads(n)
    except Exception:
        # best effort: failing to configure threads must not break callers.
        # Only Exception is caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        pass


def _evaluate_standard(op, op_str, a, b, raise_on_error=True):
    """ apply ``op`` to ``a`` and ``b`` directly, without numexpr

        ``op_str`` and ``raise_on_error`` are accepted but not used here.
    """
    result = op(a, b)
    return result

def _can_use_numexpr(op, op_str, a, b):
    """ return True if numexpr WILL be used for this operation, else False

        numexpr is used only when all of the following hold:

        - ``op_str`` is not None
        - the product of ``a.shape`` exceeds ``_MIN_ELEMENTS`` (below that
          numexpr only adds overhead)
        - no operand exposing ``get_dtype_counts`` reports more than one dtype
        - every dtype name collected from ``a`` and ``b`` is in
          ``_ALLOWED_DTYPES`` (or no dtype name could be collected at all)
    """
    if op_str is None:
        return False

    # too few elements: not worth the numexpr overhead
    if np.prod(a.shape) <= _MIN_ELEMENTS:
        return False

    # gather the dtype names of both operands
    seen = set()
    for operand in (a, b):
        if hasattr(operand, 'get_dtype_counts'):
            counts = operand.get_dtype_counts()
            if len(counts) > 1:
                # mixed-dtype container
                return False
            seen.update(counts.index)
        elif isinstance(operand, np.ndarray):
            seen.add(operand.dtype.name)

    # the allowed dtypes must be a superset of what we found
    return not seen or seen.issubset(_ALLOWED_DTYPES)

def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False):
    """ evaluate ``a <op_str> b`` with numexpr, falling back to ``op(a, b)``

        Parameters
        ----------

        op : callable applied as ``op(a, b)`` on the fallback path
        op_str : operator text spliced into the numexpr expression
        a : left operand
        b : right operand
        raise_on_error : when true, an exception raised by numexpr other than
                         ValueError is re-raised as TypeError carrying the
                         original message; otherwise it is suppressed and the
                         fallback path is used

        Returns
        -------
        the numexpr result, or the result of ``_evaluate_standard`` when
        numexpr was not used or produced no result
    """
    result = None

    if _can_use_numexpr(op, op_str, a, b):
        try:
            # hand numexpr the ``.values`` attribute of an operand when it
            # has one, otherwise the operand itself
            a_value = a.values if hasattr(a, 'values') else a
            b_value = b.values if hasattr(b, 'values') else b
            result = ne.evaluate('a_value %s b_value' % op_str,
                                 local_dict={'a_value': a_value,
                                             'b_value': b_value},
                                 casting='safe')
        except ValueError:
            # every ValueError (e.g. numexpr's 'unknown type object') is
            # ignored here so the standard path below handles the operands
            pass
        except Exception as detail:
            if raise_on_error:
                raise TypeError(str(detail))

    if result is None:
        result = _evaluate_standard(op, op_str, a, b, raise_on_error)

    return result

# bind _evaluate at import time, requesting numexpr (the request is only
# honoured when numexpr is installed)
set_use_numexpr(True)

def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True):
    """ apply a binary operator to a and b and return the result

        Parameters
        ----------

        op : the operator, as a callable taking (a, b)
        op_str : the string version of the operator
        a : left operand
        b : right operand
        raise_on_error : forwarded to the selected evaluation function
                         (default is False)
        use_numexpr : when true (the default) dispatch to the module-level
                      ``_evaluate``; otherwise call ``_evaluate_standard``
    """
    impl = _evaluate if use_numexpr else _evaluate_standard
    return impl(op, op_str, a, b, raise_on_error=raise_on_error)


92 changes: 42 additions & 50 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
_is_index_slice, _check_bool_indexer)
from pandas.core.internals import BlockManager, make_block, form_blocks
from pandas.core.series import Series, _radd_compat
import pandas.core.expressions as expressions
from pandas.compat.scipy import scoreatpercentile as _quantile
from pandas.util.compat import OrderedDict
from pandas.util import py3compat
Expand All @@ -53,7 +54,6 @@

from pandas.core.config import get_option


#----------------------------------------------------------------------
# Docstring templates

Expand Down Expand Up @@ -186,10 +186,10 @@ class DataConflictError(Exception):
# Factory helper methods


def _arith_method(op, name, default_axis='columns'):
def _arith_method(op, name, str_rep = None, default_axis='columns'):
def na_op(x, y):
try:
result = op(x, y)
result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True)
except TypeError:
xrav = x.ravel()
result = np.empty(x.size, dtype=x.dtype)
Expand Down Expand Up @@ -240,7 +240,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
return f


def _flex_comp_method(op, name, default_axis='columns'):
def _flex_comp_method(op, name, str_rep = None, default_axis='columns'):

def na_op(x, y):
try:
Expand Down Expand Up @@ -268,7 +268,7 @@ def na_op(x, y):
@Appender('Wrapper for flexible comparison methods %s' % name)
def f(self, other, axis=default_axis, level=None):
if isinstance(other, DataFrame): # Another DataFrame
return self._flex_compare_frame(other, na_op, level)
return self._flex_compare_frame(other, na_op, str_rep, level)

elif isinstance(other, Series):
return self._combine_series(other, na_op, None, axis, level)
Expand All @@ -294,7 +294,7 @@ def f(self, other, axis=default_axis, level=None):
casted = DataFrame(other, index=self.index,
columns=self.columns)

return self._flex_compare_frame(casted, na_op, level)
return self._flex_compare_frame(casted, na_op, str_rep, level)

else: # pragma: no cover
raise ValueError("Bad argument shape")
Expand All @@ -307,11 +307,11 @@ def f(self, other, axis=default_axis, level=None):
return f


def _comp_method(func, name):
def _comp_method(func, name, str_rep):
@Appender('Wrapper for comparison method %s' % name)
def f(self, other):
if isinstance(other, DataFrame): # Another DataFrame
return self._compare_frame(other, func)
return self._compare_frame(other, func, str_rep)
elif isinstance(other, Series):
return self._combine_series_infer(other, func)
else:
Expand Down Expand Up @@ -750,26 +750,26 @@ def __contains__(self, key):
#----------------------------------------------------------------------
# Arithmetic methods

add = _arith_method(operator.add, 'add')
mul = _arith_method(operator.mul, 'multiply')
sub = _arith_method(operator.sub, 'subtract')
div = divide = _arith_method(lambda x, y: x / y, 'divide')
pow = _arith_method(operator.pow, 'pow')
add = _arith_method(operator.add, 'add', '+')
mul = _arith_method(operator.mul, 'multiply', '*')
sub = _arith_method(operator.sub, 'subtract', '-')
div = divide = _arith_method(lambda x, y: x / y, 'divide', '/')
pow = _arith_method(operator.pow, 'pow', '**')

radd = _arith_method(_radd_compat, 'radd')
rmul = _arith_method(operator.mul, 'rmultiply')
rsub = _arith_method(lambda x, y: y - x, 'rsubtract')
rdiv = _arith_method(lambda x, y: y / x, 'rdivide')
rpow = _arith_method(lambda x, y: y ** x, 'rpow')

__add__ = _arith_method(operator.add, '__add__', default_axis=None)
__sub__ = _arith_method(operator.sub, '__sub__', default_axis=None)
__mul__ = _arith_method(operator.mul, '__mul__', default_axis=None)
__truediv__ = _arith_method(operator.truediv, '__truediv__',
__add__ = _arith_method(operator.add, '__add__', '+', default_axis=None)
__sub__ = _arith_method(operator.sub, '__sub__', '-', default_axis=None)
__mul__ = _arith_method(operator.mul, '__mul__', '*', default_axis=None)
__truediv__ = _arith_method(operator.truediv, '__truediv__', '/',
default_axis=None)
__floordiv__ = _arith_method(operator.floordiv, '__floordiv__',
default_axis=None)
__pow__ = _arith_method(operator.pow, '__pow__', default_axis=None)
__pow__ = _arith_method(operator.pow, '__pow__', '**', default_axis=None)

__radd__ = _arith_method(_radd_compat, '__radd__', default_axis=None)
__rmul__ = _arith_method(operator.mul, '__rmul__', default_axis=None)
Expand All @@ -782,13 +782,13 @@ def __contains__(self, key):
default_axis=None)

# boolean operators
__and__ = _arith_method(operator.and_, '__and__')
__or__ = _arith_method(operator.or_, '__or__')
__and__ = _arith_method(operator.and_, '__and__', '&')
__or__ = _arith_method(operator.or_, '__or__', '|')
__xor__ = _arith_method(operator.xor, '__xor__')

# Python 2 division methods
if not py3compat.PY3:
__div__ = _arith_method(operator.div, '__div__', default_axis=None)
__div__ = _arith_method(operator.div, '__div__', '/', default_axis=None)
__rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__',
default_axis=None)

Expand All @@ -801,19 +801,19 @@ def __invert__(self):
return self._wrap_array(arr, self.axes, copy=False)

# Comparison methods
__eq__ = _comp_method(operator.eq, '__eq__')
__ne__ = _comp_method(operator.ne, '__ne__')
__lt__ = _comp_method(operator.lt, '__lt__')
__gt__ = _comp_method(operator.gt, '__gt__')
__le__ = _comp_method(operator.le, '__le__')
__ge__ = _comp_method(operator.ge, '__ge__')

eq = _flex_comp_method(operator.eq, 'eq')
ne = _flex_comp_method(operator.ne, 'ne')
gt = _flex_comp_method(operator.gt, 'gt')
lt = _flex_comp_method(operator.lt, 'lt')
ge = _flex_comp_method(operator.ge, 'ge')
le = _flex_comp_method(operator.le, 'le')
__eq__ = _comp_method(operator.eq, '__eq__', '==')
__ne__ = _comp_method(operator.ne, '__ne__', '!=')
__lt__ = _comp_method(operator.lt, '__lt__', '<' )
__gt__ = _comp_method(operator.gt, '__gt__', '>' )
__le__ = _comp_method(operator.le, '__le__', '<=')
__ge__ = _comp_method(operator.ge, '__ge__', '>=')

eq = _flex_comp_method(operator.eq, 'eq', '==')
ne = _flex_comp_method(operator.ne, 'ne', '!=')
lt = _flex_comp_method(operator.lt, 'lt', '<')
gt = _flex_comp_method(operator.gt, 'gt', '>')
le = _flex_comp_method(operator.le, 'le', '<=')
ge = _flex_comp_method(operator.ge, 'ge', '>=')

def dot(self, other):
"""
Expand Down Expand Up @@ -1669,14 +1669,6 @@ def convert_objects(self, convert_dates=True, convert_numeric=False):
"""
return self._constructor(self._data.convert(convert_dates=convert_dates, convert_numeric=convert_numeric))

def get_dtype_counts(self):
""" return the counts of dtypes in this frame """
self._consolidate_inplace()
counts = dict()
for b in self._data.blocks:
counts[b.dtype.name] = counts.get(b.dtype,0) + b.shape[0]
return Series(counts)

#----------------------------------------------------------------------
# properties for index and columns

Expand Down Expand Up @@ -3710,25 +3702,25 @@ def _combine_const(self, other, func, raise_on_error = True):
new_data = self._data.eval(func, other, raise_on_error=raise_on_error)
return self._constructor(new_data)

def _compare_frame(self, other, func):
def _compare_frame(self, other, func, str_rep):
if not self._indexed_same(other):
raise Exception('Can only compare identically-labeled '
'DataFrame objects')

new_data = {}
for col in self.columns:
new_data[col] = func(self[col], other[col])
def _compare(a, b):
return dict([ (col,func(a[col], b[col])) for col in a.columns ])
new_data = expressions.evaluate(_compare, str_rep, self, other)

return self._constructor(data=new_data, index=self.index,
columns=self.columns, copy=False)

def _flex_compare_frame(self, other, func, level):
def _flex_compare_frame(self, other, func, str_rep, level):
if not self._indexed_same(other):
self, other = self.align(other, 'outer', level=level)

new_data = {}
for col in self.columns:
new_data[col] = func(self[col], other[col])
def _compare(a, b):
return dict([ (col,func(a[col], b[col])) for col in a.columns ])
new_data = expressions.evaluate(_compare, str_rep, self, other)

return self._constructor(data=new_data, index=self.index,
columns=self.columns, copy=False)
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,11 @@ def __delitem__(self, key):
except KeyError:
pass

def get_dtype_counts(self):
    """ return a Series built from the dtype counts reported by the
        underlying data manager (``self._data.get_dtype_counts()``) """
    # imported here rather than at module level
    from pandas import Series
    dtype_counts = self._data.get_dtype_counts()
    return Series(dtype_counts)

def pop(self, item):
"""
Return item and drop from frame. Raise KeyError if not found.
Expand Down
Loading