Skip to content

Commit 13f54e5

Browse files
committed
Merge pull request #2925 from jreback/compare
ENH: numexpr on boolean frames
2 parents fbebe4f + e273828 commit 13f54e5

File tree

11 files changed

+485
-71
lines changed

11 files changed

+485
-71
lines changed

README.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ Dependencies
7070
* `pytz <http://pytz.sourceforge.net/>`__
7171
* Needed for time zone support with ``date_range``
7272

73+
Highly Recommended Dependencies
74+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75+
* `numexpr <http://code.google.com/p/numexpr/>`__: to accelerate some expression evaluation operations
76+
also required by `PyTables`
77+
* `bottleneck <http://berkeleyanalytics.com/>`__: to accelerate certain numerical operations
78+
7379
Optional dependencies
7480
~~~~~~~~~~~~~~~~~~~~~
7581

doc/source/v0.11.0.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,12 @@ API changes
224224
Enhancements
225225
~~~~~~~~~~~~
226226

227+
- Numexpr is now a 'highly recommended dependency', to accelerate certain
228+
types of expression evaluation
229+
230+
- Bottleneck is now a 'highly recommended dependency', to accelerate certain
231+
types of numerical evaluations
232+
227233
- In ``HDFStore``, provide dotted attribute access to ``get`` from stores
228234
(e.g. ``store.df == store['df']``)
229235

pandas/core/expressions.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
Expressions
3+
-----------
4+
5+
Offer fast expression evaluation through numexpr
6+
7+
"""
8+
import numpy as np
9+
10+
try:
11+
import numexpr as ne
12+
_NUMEXPR_INSTALLED = True
13+
except ImportError: # pragma: no cover
14+
_NUMEXPR_INSTALLED = False
15+
16+
_USE_NUMEXPR = _NUMEXPR_INSTALLED
17+
_evaluate = None
18+
19+
# the set of dtypes that we will allow to be passed to numexpr
20+
_ALLOWED_DTYPES = set(['int64','int32','float64','float32','bool'])
21+
22+
# the minimum number of elements (product of the shape) above which we will use numexpr
23+
_MIN_ELEMENTS = 10000
24+
25+
def set_use_numexpr(v=True):
    """ Turn numexpr acceleration on or off and re-select the evaluator.

    Parameters
    ----------
    v : new value for the module-level ``_USE_NUMEXPR`` flag (default True);
        only applied when numexpr could be imported

    The module-level ``_evaluate`` callable is always re-pointed afterwards:
    at ``_evaluate_numexpr`` when the flag is truthy, otherwise at
    ``_evaluate_standard``.
    """
    global _USE_NUMEXPR, _evaluate

    # the flag can only be changed when numexpr is actually importable
    if _NUMEXPR_INSTALLED:
        _USE_NUMEXPR = v

    # choose the evaluator that matches the (possibly unchanged) flag
    _evaluate = _evaluate_numexpr if _USE_NUMEXPR else _evaluate_standard
38+
39+
#print "evaluate -> %s" % _evaluate
40+
41+
def set_numexpr_threads(n=None):
    """ Set the number of threads numexpr uses (best effort).

    Parameters
    ----------
    n : number of threads; when None, the value reported by
        ``ne.detect_number_of_cores()`` is used

    Does nothing unless numexpr is installed and currently enabled.
    Errors raised by numexpr are deliberately ignored, since thread tuning
    is an optimisation and must never break the caller.
    """
    try:
        if _NUMEXPR_INSTALLED and _USE_NUMEXPR:
            if n is None:
                n = ne.detect_number_of_cores()
            ne.set_num_threads(n)
    except Exception:
        # best effort only; ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt / SystemExit propagate
        pass
51+
52+
53+
def _evaluate_standard(op, op_str, a, b, raise_on_error=True):
54+
""" standard evaluation """
55+
return op(a,b)
56+
57+
def _can_use_numexpr(op, op_str, a, b):
    """ Return True if this operation WILL be handed to numexpr.

    All of the following must hold:

    - a string representation of the operator is available
    - ``a`` has more than ``_MIN_ELEMENTS`` elements (below that, numexpr
      only adds overhead)
    - any operand exposing ``get_dtype_counts`` reports at most one dtype
    - every dtype collected from the operands is in ``_ALLOWED_DTYPES``
      (collecting no dtypes at all also qualifies)
    """
    # no string form of the operator -> nothing to give to numexpr
    if op_str is None:
        return False

    # required min elements (otherwise we are adding overhead)
    if not (np.prod(a.shape) > _MIN_ELEMENTS):
        return False

    # check for dtype compatibility
    seen = set()
    for operand in (a, b):
        if hasattr(operand, 'get_dtype_counts'):
            counts = operand.get_dtype_counts()
            if len(counts) > 1:
                # mixed-dtype container: leave it to the standard path
                return False
            seen.update(counts.index)
        elif isinstance(operand, np.ndarray):
            seen.add(operand.dtype.name)

    # the allowed dtypes must be a superset of what we found
    return not seen or _ALLOWED_DTYPES >= seen
80+
81+
def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False):
    """ Evaluate ``a <op_str> b`` with numexpr, falling back to ``op(a, b)``.

    Parameters
    ----------
    op : the operator callable, used for the fallback evaluation
    op_str : the string version of the op, spliced into the numexpr
        expression
    a : left operand
    b : right operand
    raise_on_error : when True, a non-ValueError failure inside numexpr is
        re-raised as ``TypeError``; otherwise it is swallowed and the
        standard evaluation is used instead (default False)

    Returns
    -------
    the numexpr result when numexpr was usable and succeeded, otherwise
    the result of ``_evaluate_standard``
    """
    result = None

    if _can_use_numexpr(op, op_str, a, b):
        try:
            # pass the underlying ``.values`` to numexpr where an operand
            # exposes them, otherwise the operand itself
            a_value, b_value = a, b
            if hasattr(a_value, 'values'):
                a_value = a_value.values
            if hasattr(b_value, 'values'):
                b_value = b_value.values
            result = ne.evaluate('a_value %s b_value' % op_str,
                                 local_dict={'a_value': a_value,
                                             'b_value': b_value},
                                 casting='safe')
        except ValueError:
            # e.g. "unknown type object": numexpr cannot handle these
            # operands, so fall through to the standard evaluation below
            pass
        except Exception as detail:
            if raise_on_error:
                raise TypeError(str(detail))

    if result is None:
        result = _evaluate_standard(op, op_str, a, b, raise_on_error)

    return result
106+
107+
# turn myself on
108+
set_use_numexpr(True)
109+
110+
def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True):
    """ evaluate and return the result of applying the op to a and b

        Parameters
        ----------

        op : the actual operator (a callable taking a and b)
        op_str : the string version of the op
        a : left operand
        b : right operand
        raise_on_error : passed through to the selected evaluator
                         (default is False)
        use_numexpr : whether to go through the currently selected
                      evaluator (``_evaluate``) or to always use the
                      standard evaluation (default True)
        """
    # ``_evaluate`` is whichever evaluator set_use_numexpr last selected
    evaluator = _evaluate if use_numexpr else _evaluate_standard
    return evaluator(op, op_str, a, b, raise_on_error=raise_on_error)
128+
129+

pandas/core/frame.py

Lines changed: 42 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
_is_index_slice, _check_bool_indexer)
3232
from pandas.core.internals import BlockManager, make_block, form_blocks
3333
from pandas.core.series import Series, _radd_compat
34+
import pandas.core.expressions as expressions
3435
from pandas.compat.scipy import scoreatpercentile as _quantile
3536
from pandas.util.compat import OrderedDict
3637
from pandas.util import py3compat
@@ -53,7 +54,6 @@
5354

5455
from pandas.core.config import get_option
5556

56-
5757
#----------------------------------------------------------------------
5858
# Docstring templates
5959

@@ -186,10 +186,10 @@ class DataConflictError(Exception):
186186
# Factory helper methods
187187

188188

189-
def _arith_method(op, name, default_axis='columns'):
189+
def _arith_method(op, name, str_rep = None, default_axis='columns'):
190190
def na_op(x, y):
191191
try:
192-
result = op(x, y)
192+
result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True)
193193
except TypeError:
194194
xrav = x.ravel()
195195
result = np.empty(x.size, dtype=x.dtype)
@@ -240,7 +240,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
240240
return f
241241

242242

243-
def _flex_comp_method(op, name, default_axis='columns'):
243+
def _flex_comp_method(op, name, str_rep = None, default_axis='columns'):
244244

245245
def na_op(x, y):
246246
try:
@@ -268,7 +268,7 @@ def na_op(x, y):
268268
@Appender('Wrapper for flexible comparison methods %s' % name)
269269
def f(self, other, axis=default_axis, level=None):
270270
if isinstance(other, DataFrame): # Another DataFrame
271-
return self._flex_compare_frame(other, na_op, level)
271+
return self._flex_compare_frame(other, na_op, str_rep, level)
272272

273273
elif isinstance(other, Series):
274274
return self._combine_series(other, na_op, None, axis, level)
@@ -294,7 +294,7 @@ def f(self, other, axis=default_axis, level=None):
294294
casted = DataFrame(other, index=self.index,
295295
columns=self.columns)
296296

297-
return self._flex_compare_frame(casted, na_op, level)
297+
return self._flex_compare_frame(casted, na_op, str_rep, level)
298298

299299
else: # pragma: no cover
300300
raise ValueError("Bad argument shape")
@@ -307,11 +307,11 @@ def f(self, other, axis=default_axis, level=None):
307307
return f
308308

309309

310-
def _comp_method(func, name):
310+
def _comp_method(func, name, str_rep):
311311
@Appender('Wrapper for comparison method %s' % name)
312312
def f(self, other):
313313
if isinstance(other, DataFrame): # Another DataFrame
314-
return self._compare_frame(other, func)
314+
return self._compare_frame(other, func, str_rep)
315315
elif isinstance(other, Series):
316316
return self._combine_series_infer(other, func)
317317
else:
@@ -750,26 +750,26 @@ def __contains__(self, key):
750750
#----------------------------------------------------------------------
751751
# Arithmetic methods
752752

753-
add = _arith_method(operator.add, 'add')
754-
mul = _arith_method(operator.mul, 'multiply')
755-
sub = _arith_method(operator.sub, 'subtract')
756-
div = divide = _arith_method(lambda x, y: x / y, 'divide')
757-
pow = _arith_method(operator.pow, 'pow')
753+
add = _arith_method(operator.add, 'add', '+')
754+
mul = _arith_method(operator.mul, 'multiply', '*')
755+
sub = _arith_method(operator.sub, 'subtract', '-')
756+
div = divide = _arith_method(lambda x, y: x / y, 'divide', '/')
757+
pow = _arith_method(operator.pow, 'pow', '**')
758758

759759
radd = _arith_method(_radd_compat, 'radd')
760760
rmul = _arith_method(operator.mul, 'rmultiply')
761761
rsub = _arith_method(lambda x, y: y - x, 'rsubtract')
762762
rdiv = _arith_method(lambda x, y: y / x, 'rdivide')
763763
rpow = _arith_method(lambda x, y: y ** x, 'rpow')
764764

765-
__add__ = _arith_method(operator.add, '__add__', default_axis=None)
766-
__sub__ = _arith_method(operator.sub, '__sub__', default_axis=None)
767-
__mul__ = _arith_method(operator.mul, '__mul__', default_axis=None)
768-
__truediv__ = _arith_method(operator.truediv, '__truediv__',
765+
__add__ = _arith_method(operator.add, '__add__', '+', default_axis=None)
766+
__sub__ = _arith_method(operator.sub, '__sub__', '-', default_axis=None)
767+
__mul__ = _arith_method(operator.mul, '__mul__', '*', default_axis=None)
768+
__truediv__ = _arith_method(operator.truediv, '__truediv__', '/',
769769
default_axis=None)
770770
__floordiv__ = _arith_method(operator.floordiv, '__floordiv__',
771771
default_axis=None)
772-
__pow__ = _arith_method(operator.pow, '__pow__', default_axis=None)
772+
__pow__ = _arith_method(operator.pow, '__pow__', '**', default_axis=None)
773773

774774
__radd__ = _arith_method(_radd_compat, '__radd__', default_axis=None)
775775
__rmul__ = _arith_method(operator.mul, '__rmul__', default_axis=None)
@@ -782,13 +782,13 @@ def __contains__(self, key):
782782
default_axis=None)
783783

784784
# boolean operators
785-
__and__ = _arith_method(operator.and_, '__and__')
786-
__or__ = _arith_method(operator.or_, '__or__')
785+
__and__ = _arith_method(operator.and_, '__and__', '&')
786+
__or__ = _arith_method(operator.or_, '__or__', '|')
787787
__xor__ = _arith_method(operator.xor, '__xor__')
788788

789789
# Python 2 division methods
790790
if not py3compat.PY3:
791-
__div__ = _arith_method(operator.div, '__div__', default_axis=None)
791+
__div__ = _arith_method(operator.div, '__div__', '/', default_axis=None)
792792
__rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__',
793793
default_axis=None)
794794

@@ -801,19 +801,19 @@ def __invert__(self):
801801
return self._wrap_array(arr, self.axes, copy=False)
802802

803803
# Comparison methods
804-
__eq__ = _comp_method(operator.eq, '__eq__')
805-
__ne__ = _comp_method(operator.ne, '__ne__')
806-
__lt__ = _comp_method(operator.lt, '__lt__')
807-
__gt__ = _comp_method(operator.gt, '__gt__')
808-
__le__ = _comp_method(operator.le, '__le__')
809-
__ge__ = _comp_method(operator.ge, '__ge__')
810-
811-
eq = _flex_comp_method(operator.eq, 'eq')
812-
ne = _flex_comp_method(operator.ne, 'ne')
813-
gt = _flex_comp_method(operator.gt, 'gt')
814-
lt = _flex_comp_method(operator.lt, 'lt')
815-
ge = _flex_comp_method(operator.ge, 'ge')
816-
le = _flex_comp_method(operator.le, 'le')
804+
__eq__ = _comp_method(operator.eq, '__eq__', '==')
805+
__ne__ = _comp_method(operator.ne, '__ne__', '!=')
806+
__lt__ = _comp_method(operator.lt, '__lt__', '<' )
807+
__gt__ = _comp_method(operator.gt, '__gt__', '>' )
808+
__le__ = _comp_method(operator.le, '__le__', '<=')
809+
__ge__ = _comp_method(operator.ge, '__ge__', '>=')
810+
811+
eq = _flex_comp_method(operator.eq, 'eq', '==')
812+
ne = _flex_comp_method(operator.ne, 'ne', '!=')
813+
lt = _flex_comp_method(operator.lt, 'lt', '<')
814+
gt = _flex_comp_method(operator.gt, 'gt', '>')
815+
le = _flex_comp_method(operator.le, 'le', '<=')
816+
ge = _flex_comp_method(operator.ge, 'ge', '>=')
817817

818818
def dot(self, other):
819819
"""
@@ -1669,14 +1669,6 @@ def convert_objects(self, convert_dates=True, convert_numeric=False):
16691669
"""
16701670
return self._constructor(self._data.convert(convert_dates=convert_dates, convert_numeric=convert_numeric))
16711671

1672-
def get_dtype_counts(self):
1673-
""" return the counts of dtypes in this frame """
1674-
self._consolidate_inplace()
1675-
counts = dict()
1676-
for b in self._data.blocks:
1677-
counts[b.dtype.name] = counts.get(b.dtype,0) + b.shape[0]
1678-
return Series(counts)
1679-
16801672
#----------------------------------------------------------------------
16811673
# properties for index and columns
16821674

@@ -3710,25 +3702,25 @@ def _combine_const(self, other, func, raise_on_error = True):
37103702
new_data = self._data.eval(func, other, raise_on_error=raise_on_error)
37113703
return self._constructor(new_data)
37123704

3713-
def _compare_frame(self, other, func):
3705+
def _compare_frame(self, other, func, str_rep):
37143706
if not self._indexed_same(other):
37153707
raise Exception('Can only compare identically-labeled '
37163708
'DataFrame objects')
37173709

3718-
new_data = {}
3719-
for col in self.columns:
3720-
new_data[col] = func(self[col], other[col])
3710+
def _compare(a, b):
3711+
return dict([ (col,func(a[col], b[col])) for col in a.columns ])
3712+
new_data = expressions.evaluate(_compare, str_rep, self, other)
37213713

37223714
return self._constructor(data=new_data, index=self.index,
37233715
columns=self.columns, copy=False)
37243716

3725-
def _flex_compare_frame(self, other, func, level):
3717+
def _flex_compare_frame(self, other, func, str_rep, level):
37263718
if not self._indexed_same(other):
37273719
self, other = self.align(other, 'outer', level=level)
37283720

3729-
new_data = {}
3730-
for col in self.columns:
3731-
new_data[col] = func(self[col], other[col])
3721+
def _compare(a, b):
3722+
return dict([ (col,func(a[col], b[col])) for col in a.columns ])
3723+
new_data = expressions.evaluate(_compare, str_rep, self, other)
37323724

37333725
return self._constructor(data=new_data, index=self.index,
37343726
columns=self.columns, copy=False)

pandas/core/generic.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,11 @@ def __delitem__(self, key):
606606
except KeyError:
607607
pass
608608

609+
def get_dtype_counts(self):
    """ Return the dtype counts of this object as a Series.

    The counts are whatever ``self._data.get_dtype_counts()`` reports;
    they are wrapped in a ``Series`` unchanged.
    """
    # imported inside the method, as in the original code
    from pandas import Series

    counts = self._data.get_dtype_counts()
    return Series(counts)
613+
609614
def pop(self, item):
610615
"""
611616
Return item and drop from frame. Raise KeyError if not found.

0 commit comments

Comments
 (0)