From a8dde5acdc7ad79beebfb93ca0c83b4e12898c0e Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 25 Feb 2013 08:33:23 -0500 Subject: [PATCH 1/3] ENH: numexpr on boolean frames BLD: add highly recommended dependencies section and setup.py changes to optionally build these ENH: add binary_ops vbench add binary_ops accelerations via ne created expressions.py core module ENH: added support for remainder of comp and compare methods added vbench for add/mult BUG: catching error in boolean comparisons if type is int and trying to put NA fixed failing tests from boolean comparisons removed radd operations from numexpr numexpr to now ignore a failing type operation and fail over TST: failing test because float16 not upcasted on 32-bit....weird TST: added test_expressions test suite added ability to turn off usage of numexpr (mainly for testing) restricted ops to allowed dtypes and min shape of input --- README.rst | 5 ++ doc/source/v0.11.0.txt | 6 ++ pandas/core/expressions.py | 101 ++++++++++++++++++++++++++++ pandas/core/frame.py | 92 ++++++++++++-------------- pandas/core/generic.py | 5 ++ pandas/core/internals.py | 24 ++++--- pandas/core/series.py | 3 + pandas/tests/test_expressions.py | 109 +++++++++++++++++++++++++++++++ pandas/tests/test_frame.py | 29 ++++---- setup.py | 2 + vb_suite/binary_ops.py | 30 +++++++++ vb_suite/indexing.py | 10 ++- 12 files changed, 345 insertions(+), 71 deletions(-) create mode 100644 pandas/core/expressions.py create mode 100644 pandas/tests/test_expressions.py diff --git a/README.rst b/README.rst index 66a7a3e2b627f..3093c235287b8 100644 --- a/README.rst +++ b/README.rst @@ -70,6 +70,11 @@ Dependencies * `pytz `__ * Needed for time zone support with ``date_range`` +Highly Recommended Dependencies +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * `numexpr `__: to accelerate some expression evaluation operations + * `bottleneck `__: to accelerate certain numerical operations + Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.11.0.txt 
b/doc/source/v0.11.0.txt index cd7540328230f..b75ea0e664f6d 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -224,6 +224,12 @@ API changes Enhancements ~~~~~~~~~~~~ + - Numexpr is now a 'highly recommended dependency', to accelerate certain + types of expression evaluation + + - Bottleneck is now a 'highly recommended dependency', to accelerate certain + types of numerical evaluations + - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. ``store.df == store['df']``) diff --git a/pandas/core/expressions.py b/pandas/core/expressions.py new file mode 100644 index 0000000000000..3612026b21beb --- /dev/null +++ b/pandas/core/expressions.py @@ -0,0 +1,101 @@ +""" +Expressions +----------- + +Offer fast expression evaluation thru numexpr + +""" +import numpy as np + +try: + import numexpr as ne + _USE_NUMEXPR = True +except ImportError: # pragma: no cover + _USE_NUMEXPR = False + +# the set of dtypes that we will allow pass to numexpr +_ALLOWED_DTYPES = set(['int64','int32','float64','float32','bool']) + +# the minimum prod shape that we will use numexpr +_MIN_ELEMENTS = 10000 + +def _evaluate_standard(op, op_str, a, b, raise_on_error=True): + """ standard evaluation """ + return op(a,b) + +def _can_use_numexpr(op, op_str, a, b): + """ return a boolean if we WILL be using numexpr """ + if op_str is not None: + + # required min elements (otherwise we are adding overhead) + if np.prod(a.shape) > _MIN_ELEMENTS: + + # check for dtype compatiblity + dtypes = set() + for o in [ a, b ]: + if hasattr(o,'get_dtype_counts'): + s = o.get_dtype_counts() + if len(s) > 1: + return False + dtypes |= set(s.index) + elif isinstance(o,np.ndarray): + dtypes |= set([o.dtype.name]) + + # allowed are a superset + if not len(dtypes) or _ALLOWED_DTYPES >= dtypes: + return True + + return False + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False): + result = None + + if _can_use_numexpr(op, op_str, a, b): + try: + a_value, b_value = a, 
b + if hasattr(a_value,'values'): + a_value = a_value.values + if hasattr(b_value,'values'): + b_value = b_value.values + result = ne.evaluate('a_value %s b_value' % op_str, + local_dict={ 'a_value' : a_value, + 'b_value' : b_value }, + casting='safe') + except (ValueError), detail: + if 'unknown type object' in str(detail): + pass + except (Exception), detail: + if raise_on_error: + raise TypeError(str(detail)) + + if result is None: + result = _evaluate_standard(op,op_str,a,b,raise_on_error) + + return result + +# choose what we are going to do +if not _USE_NUMEXPR: + _evaluate = _evaluate_standard +else: + _evaluate = _evaluate_numexpr + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True): + """ evaluate and return the expression of the op on a and b + + Parameters + ---------- + + op : the actual operand + op_str: the string version of the op + a : left operand + b : right operand + raise_on_error : pass the error to the higher level if indicated (default is False), + otherwise evaluate the op with and return the results + use_numexpr : whether to try to use numexpr (default True) + """ + + if use_numexpr: + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error) + return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + + diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 13ae446be5c0b..7ae2cc6d5b6ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -31,6 +31,7 @@ _is_index_slice, _check_bool_indexer) from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series, _radd_compat +import pandas.core.expressions as expressions from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.util.compat import OrderedDict from pandas.util import py3compat @@ -53,7 +54,6 @@ from pandas.core.config import get_option - #---------------------------------------------------------------------- # Docstring templates @@ -186,10 +186,10 @@ 
class DataConflictError(Exception): # Factory helper methods -def _arith_method(op, name, default_axis='columns'): +def _arith_method(op, name, str_rep = None, default_axis='columns'): def na_op(x, y): try: - result = op(x, y) + result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True) except TypeError: xrav = x.ravel() result = np.empty(x.size, dtype=x.dtype) @@ -240,7 +240,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): return f -def _flex_comp_method(op, name, default_axis='columns'): +def _flex_comp_method(op, name, str_rep = None, default_axis='columns'): def na_op(x, y): try: @@ -268,7 +268,7 @@ def na_op(x, y): @Appender('Wrapper for flexible comparison methods %s' % name) def f(self, other, axis=default_axis, level=None): if isinstance(other, DataFrame): # Another DataFrame - return self._flex_compare_frame(other, na_op, level) + return self._flex_compare_frame(other, na_op, str_rep, level) elif isinstance(other, Series): return self._combine_series(other, na_op, None, axis, level) @@ -294,7 +294,7 @@ def f(self, other, axis=default_axis, level=None): casted = DataFrame(other, index=self.index, columns=self.columns) - return self._flex_compare_frame(casted, na_op, level) + return self._flex_compare_frame(casted, na_op, str_rep, level) else: # pragma: no cover raise ValueError("Bad argument shape") @@ -307,11 +307,11 @@ def f(self, other, axis=default_axis, level=None): return f -def _comp_method(func, name): +def _comp_method(func, name, str_rep): @Appender('Wrapper for comparison method %s' % name) def f(self, other): if isinstance(other, DataFrame): # Another DataFrame - return self._compare_frame(other, func) + return self._compare_frame(other, func, str_rep) elif isinstance(other, Series): return self._combine_series_infer(other, func) else: @@ -750,11 +750,11 @@ def __contains__(self, key): #---------------------------------------------------------------------- # Arithmetic methods - add = 
_arith_method(operator.add, 'add') - mul = _arith_method(operator.mul, 'multiply') - sub = _arith_method(operator.sub, 'subtract') - div = divide = _arith_method(lambda x, y: x / y, 'divide') - pow = _arith_method(operator.pow, 'pow') + add = _arith_method(operator.add, 'add', '+') + mul = _arith_method(operator.mul, 'multiply', '*') + sub = _arith_method(operator.sub, 'subtract', '-') + div = divide = _arith_method(lambda x, y: x / y, 'divide', '/') + pow = _arith_method(operator.pow, 'pow', '**') radd = _arith_method(_radd_compat, 'radd') rmul = _arith_method(operator.mul, 'rmultiply') @@ -762,14 +762,14 @@ def __contains__(self, key): rdiv = _arith_method(lambda x, y: y / x, 'rdivide') rpow = _arith_method(lambda x, y: y ** x, 'rpow') - __add__ = _arith_method(operator.add, '__add__', default_axis=None) - __sub__ = _arith_method(operator.sub, '__sub__', default_axis=None) - __mul__ = _arith_method(operator.mul, '__mul__', default_axis=None) - __truediv__ = _arith_method(operator.truediv, '__truediv__', + __add__ = _arith_method(operator.add, '__add__', '+', default_axis=None) + __sub__ = _arith_method(operator.sub, '__sub__', '-', default_axis=None) + __mul__ = _arith_method(operator.mul, '__mul__', '*', default_axis=None) + __truediv__ = _arith_method(operator.truediv, '__truediv__', '/', default_axis=None) __floordiv__ = _arith_method(operator.floordiv, '__floordiv__', default_axis=None) - __pow__ = _arith_method(operator.pow, '__pow__', default_axis=None) + __pow__ = _arith_method(operator.pow, '__pow__', '**', default_axis=None) __radd__ = _arith_method(_radd_compat, '__radd__', default_axis=None) __rmul__ = _arith_method(operator.mul, '__rmul__', default_axis=None) @@ -782,13 +782,13 @@ def __contains__(self, key): default_axis=None) # boolean operators - __and__ = _arith_method(operator.and_, '__and__') - __or__ = _arith_method(operator.or_, '__or__') + __and__ = _arith_method(operator.and_, '__and__', '&') + __or__ = _arith_method(operator.or_, '__or__', 
'|') __xor__ = _arith_method(operator.xor, '__xor__') # Python 2 division methods if not py3compat.PY3: - __div__ = _arith_method(operator.div, '__div__', default_axis=None) + __div__ = _arith_method(operator.div, '__div__', '/', default_axis=None) __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__', default_axis=None) @@ -801,19 +801,19 @@ def __invert__(self): return self._wrap_array(arr, self.axes, copy=False) # Comparison methods - __eq__ = _comp_method(operator.eq, '__eq__') - __ne__ = _comp_method(operator.ne, '__ne__') - __lt__ = _comp_method(operator.lt, '__lt__') - __gt__ = _comp_method(operator.gt, '__gt__') - __le__ = _comp_method(operator.le, '__le__') - __ge__ = _comp_method(operator.ge, '__ge__') - - eq = _flex_comp_method(operator.eq, 'eq') - ne = _flex_comp_method(operator.ne, 'ne') - gt = _flex_comp_method(operator.gt, 'gt') - lt = _flex_comp_method(operator.lt, 'lt') - ge = _flex_comp_method(operator.ge, 'ge') - le = _flex_comp_method(operator.le, 'le') + __eq__ = _comp_method(operator.eq, '__eq__', '==') + __ne__ = _comp_method(operator.ne, '__ne__', '!=') + __lt__ = _comp_method(operator.lt, '__lt__', '<' ) + __gt__ = _comp_method(operator.gt, '__gt__', '>' ) + __le__ = _comp_method(operator.le, '__le__', '<=') + __ge__ = _comp_method(operator.ge, '__ge__', '>=') + + eq = _flex_comp_method(operator.eq, 'eq', '==') + ne = _flex_comp_method(operator.ne, 'ne', '!=') + lt = _flex_comp_method(operator.lt, 'lt', '<') + gt = _flex_comp_method(operator.gt, 'gt', '>') + le = _flex_comp_method(operator.le, 'le', '<=') + ge = _flex_comp_method(operator.ge, 'ge', '>=') def dot(self, other): """ @@ -1669,14 +1669,6 @@ def convert_objects(self, convert_dates=True, convert_numeric=False): """ return self._constructor(self._data.convert(convert_dates=convert_dates, convert_numeric=convert_numeric)) - def get_dtype_counts(self): - """ return the counts of dtypes in this frame """ - self._consolidate_inplace() - counts = dict() - for b in self._data.blocks: - 
counts[b.dtype.name] = counts.get(b.dtype,0) + b.shape[0] - return Series(counts) - #---------------------------------------------------------------------- # properties for index and columns @@ -3710,25 +3702,25 @@ def _combine_const(self, other, func, raise_on_error = True): new_data = self._data.eval(func, other, raise_on_error=raise_on_error) return self._constructor(new_data) - def _compare_frame(self, other, func): + def _compare_frame(self, other, func, str_rep): if not self._indexed_same(other): raise Exception('Can only compare identically-labeled ' 'DataFrame objects') - new_data = {} - for col in self.columns: - new_data[col] = func(self[col], other[col]) + def _compare(a, b): + return dict([ (col,func(a[col], b[col])) for col in a.columns ]) + new_data = expressions.evaluate(_compare, str_rep, self, other) return self._constructor(data=new_data, index=self.index, columns=self.columns, copy=False) - def _flex_compare_frame(self, other, func, level): + def _flex_compare_frame(self, other, func, str_rep, level): if not self._indexed_same(other): self, other = self.align(other, 'outer', level=level) - new_data = {} - for col in self.columns: - new_data[col] = func(self[col], other[col]) + def _compare(a, b): + return dict([ (col,func(a[col], b[col])) for col in a.columns ]) + new_data = expressions.evaluate(_compare, str_rep, self, other) return self._constructor(data=new_data, index=self.index, columns=self.columns, copy=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c25e686afacbf..236d7d8aeadf8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -606,6 +606,11 @@ def __delitem__(self, key): except KeyError: pass + def get_dtype_counts(self): + """ return the counts of dtypes in this frame """ + from pandas import Series + return Series(self._data.get_dtype_counts()) + def pop(self, item): """ Return item and drop from frame. Raise KeyError if not found. 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5bf918aff6367..75605fae4e39f 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -418,17 +418,17 @@ def eval(self, func, other, raise_on_error = True, try_cast = False): args = [ values, other ] try: result = func(*args) - except: + except (Exception), detail: if raise_on_error: - raise TypeError('Coulnd not operate %s with block values' - % repr(other)) + raise TypeError('Could not operate [%s] with block values [%s]' + % (repr(other),str(detail))) else: # return the values result = np.empty(values.shape,dtype='O') result.fill(np.nan) if not isinstance(result, np.ndarray): - raise TypeError('Could not compare %s with block values' + raise TypeError('Could not compare [%s] with block values' % repr(other)) if is_transposed: @@ -492,10 +492,10 @@ def func(c,v,o): try: return np.where(c,v,o) - except: + except (Exception), detail: if raise_on_error: - raise TypeError('Coulnd not operate %s with block values' - % repr(o)) + raise TypeError('Could not operate [%s] with block values [%s]' + % (repr(o),str(detail))) else: # return the values result = np.empty(v.shape,dtype='float64') @@ -504,7 +504,7 @@ def func(c,v,o): def create_block(result, items, transpose = True): if not isinstance(result, np.ndarray): - raise TypeError('Could not compare %s with block values' + raise TypeError('Could not compare [%s] with block values' % repr(other)) if transpose and is_transposed: @@ -843,6 +843,14 @@ def _get_items(self): return self.axes[0] items = property(fget=_get_items) + def get_dtype_counts(self): + """ return a dict of the counts of dtypes in BlockManager """ + self._consolidate_inplace() + counts = dict() + for b in self.blocks: + counts[b.dtype.name] = counts.get(b.dtype,0) + b.shape[0] + return counts + def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [b.items for b in self.blocks] diff --git a/pandas/core/series.py b/pandas/core/series.py 
index 27480d9e489be..2870bb1ab05b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -695,6 +695,9 @@ def _get_values(self, indexer): except Exception: return self.values[indexer] + def get_dtype_counts(self): + return Series({ self.dtype.name : 1 }) + def where(self, cond, other=nan, inplace=False): """ Return a Series where cond is True; otherwise values are from other diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py new file mode 100644 index 0000000000000..cb852fc034c60 --- /dev/null +++ b/pandas/tests/test_expressions.py @@ -0,0 +1,109 @@ +# pylint: disable-msg=W0612,E1101 + +import unittest +import nose + +import operator +from numpy import random, nan +from numpy.random import randn +import numpy as np +from numpy.testing import assert_array_equal + +import pandas as pan +from pandas.core.api import DataFrame, Series, notnull, isnull +from pandas.core import expressions + +from pandas.util.testing import (assert_almost_equal, + assert_series_equal, + assert_frame_equal) +from pandas.util import py3compat + +import pandas.util.testing as tm +import pandas.lib as lib + +from numpy.testing.decorators import slow + +if not expressions._USE_NUMEXPR: + raise nose.SkipTest + +_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64') +_frame2 = DataFrame(np.random.randn(100, 4), columns = list('ABCD'), dtype='float64') +_mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') }) +_mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') }) + +class TestExpressions(unittest.TestCase): + + _multiprocess_can_split_ = False + + def setUp(self): + + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.mixed = _mixed.copy() + self.mixed2 = _mixed2.copy() + + + def test_invalid(self): + + # no 
op + result = expressions._can_use_numexpr(operator.add, None, self.frame, self.frame) + self.assert_(result == False) + + # mixed + result = expressions._can_use_numexpr(operator.add, '+', self.mixed, self.frame) + self.assert_(result == False) + + # min elements + result = expressions._can_use_numexpr(operator.add, '+', self.frame2, self.frame2) + self.assert_(result == False) + + # ok, we only check on first part of expression + result = expressions._can_use_numexpr(operator.add, '+', self.frame, self.frame2) + self.assert_(result == True) + + def test_binary_ops(self): + + for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: + + for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]: + + op = getattr(operator,op) + result = expressions._can_use_numexpr(op, op_str, f, f) + self.assert_(result == (not f._is_mixed_type)) + + result = expressions.evaluate(op, op_str, f, f, use_numexpr=True) + expected = expressions.evaluate(op, op_str, f, f, use_numexpr=False) + assert_array_equal(result,expected.values) + + result = expressions._can_use_numexpr(op, op_str, f2, f2) + self.assert_(result == False) + + def test_boolean_ops(self): + + for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: + + f11 = f + f12 = f + 1 + + f21 = f2 + f22 = f2 + 1 + + for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]: + + op = getattr(operator,op) + + result = expressions._can_use_numexpr(op, op_str, f11, f12) + self.assert_(result == (not f11._is_mixed_type)) + + result = expressions.evaluate(op, op_str, f11, f12, use_numexpr=True) + expected = expressions.evaluate(op, op_str, f11, f12, use_numexpr=False) + assert_array_equal(result,expected.values) + + result = expressions._can_use_numexpr(op, op_str, f21, f22) + self.assert_(result == False) + +if __name__ == '__main__': + # unittest.main() + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + 
exit=False) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c3f6b0b1640c3..0729f0e03782e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -49,6 +49,8 @@ def _skip_if_no_scipy(): MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64'] def _check_mixed_float(df, dtype = None): + + # float16 are most likely to be upcasted to float32 dtypes = dict(A = 'float32', B = 'float32', C = 'float16', D = 'float64') if isinstance(dtype, basestring): dtypes = dict([ (k,dtype) for k, v in dtypes.items() ]) @@ -189,7 +191,7 @@ def test_setitem_list_of_tuples(self): result = self.frame['tuples'] expected = Series(tuples, index=self.frame.index) assert_series_equal(result, expected) - + def test_getitem_boolean(self): # boolean indexing d = self.tsframe.index[10] @@ -3933,7 +3935,7 @@ def test_arith_flex_frame(self): result = getattr(self.mixed_float, op)(2 * self.mixed_float) exp = f(self.mixed_float, 2 * self.mixed_float) assert_frame_equal(result, exp) - _check_mixed_float(result) + _check_mixed_float(result, dtype = dict(C = None)) # vs mix int if op in ['add','sub','mul']: @@ -3943,7 +3945,9 @@ def test_arith_flex_frame(self): # overflow in the uint dtype = None if op in ['sub']: - dtype = dict(B = 'object') + dtype = dict(B = 'object', C = None) + elif op in ['add','mul']: + dtype = dict(C = None) assert_frame_equal(result, exp) _check_mixed_int(result, dtype = dtype) @@ -4233,9 +4237,9 @@ def test_combineFrame(self): # mix vs mix added = self.mixed_float + self.mixed_float2 - _check_mixed_float(added) + _check_mixed_float(added, dtype = dict(C = None)) added = self.mixed_float2 + self.mixed_float - _check_mixed_float(added) + _check_mixed_float(added, dtype = dict(C = None)) # with int added = self.frame + self.mixed_int @@ -4265,15 +4269,16 @@ def test_combineSeries(self): added = self.mixed_float + series _check_mixed_float(added, dtype = 'float64') added = self.mixed_float + 
series.astype('float32') - _check_mixed_float(added, dtype = dict(C = 'float32')) + _check_mixed_float(added, dtype = dict(C = None)) added = self.mixed_float + series.astype('float16') - _check_mixed_float(added) + _check_mixed_float(added, dtype = dict(C = None)) + #### these raise with numexpr.....as we are adding an int64 to an uint64....weird # vs int - added = self.mixed_int + (100*series).astype('int64') - _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = 'int64', D = 'int64')) - added = self.mixed_int + (100*series).astype('int32') - _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = 'int32', D = 'int64')) + #added = self.mixed_int + (100*series).astype('int64') + #_check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = 'int64', D = 'int64')) + #added = self.mixed_int + (100*series).astype('int32') + #_check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = 'int32', D = 'int64')) # TimeSeries import sys @@ -4320,7 +4325,7 @@ def test_combineFunc(self): result = self.mixed_float * 2 for c, s in result.iteritems(): self.assert_(np.array_equal(s.values, self.mixed_float[c].values * 2)) - _check_mixed_float(result) + _check_mixed_float(result, dtype = dict(C = None)) result = self.empty * 2 self.assert_(result.index is self.empty.index) diff --git a/setup.py b/setup.py index 3c45d16f4024d..cade4a27320f8 100755 --- a/setup.py +++ b/setup.py @@ -60,6 +60,8 @@ 'install_requires': ['python-dateutil >= 2', 'pytz', 'numpy >= %s' % min_numpy_ver], + 'extras_require' : { 'numexpr' : ['numexpr>=1.4.2'], + 'bottleneck' : ['bottleneck>=0.5.0'] }, 'use_2to3_exclude_fixers': ['lib2to3.fixes.fix_next', ], } diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index b28d1d9ee0806..1bbcac4cae8de 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -3,3 +3,33 @@ common_setup = """from pandas_vb_common import * """ + +SECTION = 'Binary ops' + 
+#---------------------------------------------------------------------- +# binary ops + +setup = common_setup + """ +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +""" +frame_add = \ + Benchmark("df + df2", setup, name='frame_add', + start_date=datetime(2012, 1, 1)) + +setup = common_setup + """ +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +""" +frame_mult = \ + Benchmark("df * df2", setup, name='frame_mult', + start_date=datetime(2012, 1, 1)) + +setup = common_setup + """ +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +""" +frame_multi_and = \ + Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and', + start_date=datetime(2012, 1, 1)) + diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 0c4898089a97f..09fb94688d287 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -83,7 +83,7 @@ # Boolean DataFrame row selection setup = common_setup + """ -df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) +df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) indexer = df['B'] > 0 obj_indexer = indexer.astype('O') """ @@ -94,6 +94,14 @@ Benchmark("df[obj_indexer]", setup, name='indexing_dataframe_boolean_rows_object') +setup = common_setup + """ +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +""" +indexing_dataframe_boolean = \ + Benchmark("df > df2", setup, name='indexing_dataframe_boolean', + start_date=datetime(2012, 1, 1)) + #---------------------------------------------------------------------- # MultiIndex sortlevel From 385ff82eabe41684af4441a539640f402f879b74 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 8 Mar 2013 16:23:33 -0500 Subject: [PATCH 2/3] ENH: added ability to use single or multi-threads in numexpr testing TST: updating test_expressions to test with 1 and num_cores ENH: added module level function in 
core/expressions to change use of numexpr BUG: using set_use_numexpr now changes the evaluation functions --- pandas/core/expressions.py | 42 ++++++++++++--- pandas/tests/test_expressions.py | 91 ++++++++++++++++++++------------ vb_suite/binary_ops.py | 70 ++++++++++++++++++++++++ vb_suite/indexing.py | 22 ++++++++ 4 files changed, 183 insertions(+), 42 deletions(-) diff --git a/pandas/core/expressions.py b/pandas/core/expressions.py index 3612026b21beb..4199c6f7f890c 100644 --- a/pandas/core/expressions.py +++ b/pandas/core/expressions.py @@ -9,9 +9,12 @@ try: import numexpr as ne - _USE_NUMEXPR = True + _NUMEXPR_INSTALLED = True except ImportError: # pragma: no cover - _USE_NUMEXPR = False + _NUMEXPR_INSTALLED = False + +_USE_NUMEXPR = _NUMEXPR_INSTALLED +_evaluate = None # the set of dtypes that we will allow pass to numexpr _ALLOWED_DTYPES = set(['int64','int32','float64','float32','bool']) @@ -19,6 +22,34 @@ # the minimum prod shape that we will use numexpr _MIN_ELEMENTS = 10000 +def set_use_numexpr(v = True): + # set/unset to use numexpr + global _USE_NUMEXPR + if _NUMEXPR_INSTALLED: + #print "setting use_numexpr : was->%s, now->%s" % (_USE_NUMEXPR,v) + _USE_NUMEXPR = v + + # choose what we are going to do + global _evaluate + if not _USE_NUMEXPR: + _evaluate = _evaluate_standard + else: + _evaluate = _evaluate_numexpr + + #print "evaluate -> %s" % _evaluate + +def set_numexpr_threads(n = None): + # if we are using numexpr, set the threads to n + # otherwise reset + try: + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) + except: + pass + + def _evaluate_standard(op, op_str, a, b, raise_on_error=True): """ standard evaluation """ return op(a,b) @@ -73,11 +104,8 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False): return result -# choose what we are going to do -if not _USE_NUMEXPR: - _evaluate = _evaluate_standard -else: - _evaluate = _evaluate_numexpr +# turn myself on 
+set_use_numexpr(True) def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True): """ evaluate and return the expression of the op on a and b diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index cb852fc034c60..a0321d2dbe55f 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -11,7 +11,7 @@ import pandas as pan from pandas.core.api import DataFrame, Series, notnull, isnull -from pandas.core import expressions +from pandas.core import expressions as expr from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -23,7 +23,7 @@ from numpy.testing.decorators import slow -if not expressions._USE_NUMEXPR: +if not expr._USE_NUMEXPR: raise nose.SkipTest _frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64') @@ -46,61 +46,82 @@ def setUp(self): def test_invalid(self): # no op - result = expressions._can_use_numexpr(operator.add, None, self.frame, self.frame) + result = expr._can_use_numexpr(operator.add, None, self.frame, self.frame) self.assert_(result == False) # mixed - result = expressions._can_use_numexpr(operator.add, '+', self.mixed, self.frame) + result = expr._can_use_numexpr(operator.add, '+', self.mixed, self.frame) self.assert_(result == False) # min elements - result = expressions._can_use_numexpr(operator.add, '+', self.frame2, self.frame2) + result = expr._can_use_numexpr(operator.add, '+', self.frame2, self.frame2) self.assert_(result == False) # ok, we only check on first part of expression - result = expressions._can_use_numexpr(operator.add, '+', self.frame, self.frame2) + result = expr._can_use_numexpr(operator.add, '+', self.frame, self.frame2) self.assert_(result == True) def test_binary_ops(self): - for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: + def testit(): - for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]: + for f, f2 in [ (self.frame, self.frame2), 
(self.mixed, self.mixed2) ]: - op = getattr(operator,op) - result = expressions._can_use_numexpr(op, op_str, f, f) - self.assert_(result == (not f._is_mixed_type)) + for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]: - result = expressions.evaluate(op, op_str, f, f, use_numexpr=True) - expected = expressions.evaluate(op, op_str, f, f, use_numexpr=False) - assert_array_equal(result,expected.values) + op = getattr(operator,op) + result = expr._can_use_numexpr(op, op_str, f, f) + self.assert_(result == (not f._is_mixed_type)) + + result = expr.evaluate(op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) + assert_array_equal(result,expected.values) - result = expressions._can_use_numexpr(op, op_str, f2, f2) - self.assert_(result == False) + result = expr._can_use_numexpr(op, op_str, f2, f2) + self.assert_(result == False) + + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() def test_boolean_ops(self): - for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: - - f11 = f - f12 = f + 1 - - f21 = f2 - f22 = f2 + 1 - - for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]: - op = getattr(operator,op) + def testit(): + for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: - result = expressions._can_use_numexpr(op, op_str, f11, f12) - self.assert_(result == (not f11._is_mixed_type)) - - result = expressions.evaluate(op, op_str, f11, f12, use_numexpr=True) - expected = expressions.evaluate(op, op_str, f11, f12, use_numexpr=False) - assert_array_equal(result,expected.values) - - result = expressions._can_use_numexpr(op, op_str, f21, f22) - self.assert_(result == False) + f11 = f + f12 = f + 1 + + f21 = f2 + f22 = f2 + 1 + + for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]: + + op = 
getattr(operator,op) + + result = expr._can_use_numexpr(op, op_str, f11, f12) + self.assert_(result == (not f11._is_mixed_type)) + + result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) + assert_array_equal(result,expected.values) + + result = expr._can_use_numexpr(op, op_str, f21, f22) + self.assert_(result == False) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() if __name__ == '__main__': # unittest.main() diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index 1bbcac4cae8de..7a2b03643dc46 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -9,6 +9,9 @@ #---------------------------------------------------------------------- # binary ops +#---------------------------------------------------------------------- +# add + setup = common_setup + """ df = DataFrame(np.random.randn(100000, 100)) df2 = DataFrame(np.random.randn(100000, 100)) @@ -17,6 +20,30 @@ Benchmark("df + df2", setup, name='frame_add', start_date=datetime(2012, 1, 1)) +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_numexpr_threads(1) +""" + +frame_add_st = \ + Benchmark("df + df2", setup, name='frame_add_st',cleanup="expr.set_numexpr_threads()", + start_date=datetime(2012, 1, 1)) + +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_use_numexpr(False) +""" +frame_add_no_ne = \ + Benchmark("df + df2", setup, name='frame_add_no_ne',cleanup="expr.set_use_numexpr(True)", + start_date=datetime(2012, 1, 1)) + +#---------------------------------------------------------------------- +# mult + setup = common_setup + """ df = DataFrame(np.random.randn(100000, 100)) 
df2 = DataFrame(np.random.randn(100000, 100)) @@ -25,6 +52,29 @@ Benchmark("df * df2", setup, name='frame_mult', start_date=datetime(2012, 1, 1)) +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_numexpr_threads(1) +""" +frame_mult_st = \ + Benchmark("df * df2", setup, name='frame_mult_st',cleanup="expr.set_numexpr_threads()", + start_date=datetime(2012, 1, 1)) + +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_use_numexpr(False) +""" +frame_mult_no_ne = \ + Benchmark("df * df2", setup, name='frame_mult_no_ne',cleanup="expr.set_use_numexpr(True)", + start_date=datetime(2012, 1, 1)) + +#---------------------------------------------------------------------- +# multi and + setup = common_setup + """ df = DataFrame(np.random.randn(100000, 100)) df2 = DataFrame(np.random.randn(100000, 100)) @@ -33,3 +83,23 @@ Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and', start_date=datetime(2012, 1, 1)) +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_numexpr_threads(1) +""" +frame_multi_and_st = \ + Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and_st',cleanup="expr.set_numexpr_threads()", + start_date=datetime(2012, 1, 1)) + +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_use_numexpr(False) +""" +frame_multi_and_no_ne = \ + Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and_no_ne',cleanup="expr.set_use_numexpr(True)", + start_date=datetime(2012, 1, 1)) + diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 09fb94688d287..ceda346fd3e57 100644 --- 
a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -102,6 +102,28 @@ Benchmark("df > df2", setup, name='indexing_dataframe_boolean', start_date=datetime(2012, 1, 1)) +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_numexpr_threads(1) +""" + +indexing_dataframe_boolean_st = \ + Benchmark("df > df2", setup, name='indexing_dataframe_boolean_st',cleanup="expr.set_numexpr_threads()", + start_date=datetime(2012, 1, 1)) + +setup = common_setup + """ +import pandas.core.expressions as expr +df = DataFrame(np.random.randn(100000, 100)) +df2 = DataFrame(np.random.randn(100000, 100)) +expr.set_use_numexpr(False) +""" + +indexing_dataframe_boolean_no_ne = \ + Benchmark("df > df2", setup, name='indexing_dataframe_boolean_no_ne',cleanup="expr.set_use_numexpr(True)", + start_date=datetime(2012, 1, 1)) + #---------------------------------------------------------------------- # MultiIndex sortlevel From e273828b46570b397cccdbf79d07a388306cd117 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 9 Mar 2013 13:55:02 -0500 Subject: [PATCH 3/3] CLN: remove setup.py changes --- README.rst | 1 + setup.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3093c235287b8..d6d1fa3ad658b 100644 --- a/README.rst +++ b/README.rst @@ -73,6 +73,7 @@ Dependencies Highly Recommended Dependencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * `numexpr `__: to accelerate some expression evaluation operations + also required by `PyTables` * `bottleneck `__: to accelerate certain numerical operations Optional dependencies diff --git a/setup.py b/setup.py index cade4a27320f8..3c45d16f4024d 100755 --- a/setup.py +++ b/setup.py @@ -60,8 +60,6 @@ 'install_requires': ['python-dateutil >= 2', 'pytz', 'numpy >= %s' % min_numpy_ver], - 'extras_require' : { 'numexpr' : ['numexpr>=1.4.2'], - 'bottleneck' : ['bottleneck>=0.5.0'] }, 'use_2to3_exclude_fixers': 
['lib2to3.fixes.fix_next', ], }