Skip to content

ENH: allow in-line expression assignment with df.eval #5343

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 27, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions doc/source/enhancingperf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -441,18 +441,27 @@ The ``DataFrame.eval`` method (Experimental)
In addition to the top level :func:`~pandas.eval` function you can also
evaluate an expression in the "context" of a ``DataFrame``.


.. ipython:: python

df = DataFrame(randn(5, 2), columns=['a', 'b'])
df.eval('a + b')


Any expression that is a valid :func:`~pandas.eval` expression is also a valid
``DataFrame.eval`` expression, with the added benefit that *you don't have to
prefix the name of the* ``DataFrame`` *to the column you're interested in
evaluating*.

In addition, you can perform in-line assignment of columns within an expression.
This allows for *formulaic evaluation*. Only a single assignment is permitted.
The assignment target can be a new column name or an existing column name, and
it must be a valid Python identifier.

.. ipython:: python

df = DataFrame(dict(a = range(5), b = range(5,10)))
df.eval('c=a+b')
df.eval('d=a+b+c')
df.eval('a=1')
df

Local Variables
~~~~~~~~~~~~~~~
Expand Down
3 changes: 2 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ Experimental Features
``numexpr`` behind the scenes. This results in large speedups for complicated
expressions involving large DataFrames/Series.
- :class:`~pandas.DataFrame` has a new :meth:`~pandas.DataFrame.eval` that
evaluates an expression in the context of the ``DataFrame``.
evaluates an expression in the context of the ``DataFrame``; allows
inline expression assignment
- A :meth:`~pandas.DataFrame.query` method has been added that allows
you to select elements of a ``DataFrame`` using a natural query syntax nearly
identical to Python syntax.
Expand Down
13 changes: 11 additions & 2 deletions pandas/computation/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ def _convert_expression(expr):


def eval(expr, parser='pandas', engine='numexpr', truediv=True,
local_dict=None, global_dict=None, resolvers=None, level=2):
local_dict=None, global_dict=None, resolvers=None, level=2,
target=None):
"""Evaluate a Python expression as a string using various backends.

The following arithmetic operations are supported: ``+``, ``-``, ``*``,
Expand Down Expand Up @@ -169,6 +170,8 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True,
level : int, optional
The number of prior stack frames to traverse and add to the current
scope. Most users will **not** need to change this parameter.
target : a target object for assignment, optional, default is None
essentially this is a passed in resolver

Returns
-------
Expand All @@ -194,7 +197,7 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True,

# get our (possibly passed-in) scope
env = _ensure_scope(global_dict=global_dict, local_dict=local_dict,
resolvers=resolvers, level=level)
resolvers=resolvers, level=level, target=target)

parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
truediv=truediv)
Expand All @@ -203,4 +206,10 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True,
eng = _engines[engine]
eng_inst = eng(parsed_expr)
ret = eng_inst.evaluate()

# assign if needed
if env.target is not None and parsed_expr.assigner is not None:
env.target[parsed_expr.assigner] = ret
return None

return ret
56 changes: 45 additions & 11 deletions pandas/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@
_arith_ops_syms, _unary_ops_syms, is_term)
from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG
from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div
from pandas.computation.ops import UndefinedVariableError


def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None,
**kwargs):
target=None, **kwargs):
"""Ensure that we are grabbing the correct scope."""
return Scope(gbls=global_dict, lcls=local_dict, level=level,
resolvers=resolvers)
resolvers=resolvers, target=target)


def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys):
Expand Down Expand Up @@ -88,20 +89,23 @@ class Scope(StringMixin):
resolver_keys : frozenset
"""
__slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers',
'resolver_keys', '_resolver', 'level', 'ntemps')
'resolver_keys', '_resolver', 'level', 'ntemps', 'target')

def __init__(self, gbls=None, lcls=None, level=1, resolvers=None):
def __init__(self, gbls=None, lcls=None, level=1, resolvers=None, target=None):
self.level = level
self.resolvers = tuple(resolvers or [])
self.globals = dict()
self.locals = dict()
self.target = target
self.ntemps = 1 # number of temporary variables in this scope

if isinstance(lcls, Scope):
ld, lcls = lcls, dict()
self.locals.update(ld.locals.copy())
self.globals.update(ld.globals.copy())
self.resolvers += ld.resolvers
if ld.target is not None:
self.target = ld.target
self.update(ld.level)

frame = sys._getframe(level)
Expand Down Expand Up @@ -130,9 +134,10 @@ def __init__(self, gbls=None, lcls=None, level=1, resolvers=None):

def __unicode__(self):
    """Return a printable summary of this scope.

    The summary lists the keys of the local scope, the keys of the global
    scope, the resolver keys and the assignment target, one per line, and
    is rendered through ``com.pprint_thing``.
    """
    # Each field needs its own positional index: repeating ``{0}`` would
    # print the list of local keys in every slot.
    template = "locals: {0}\nglobals: {1}\nresolvers: {2}\ntarget: {3}"
    return com.pprint_thing(template.format(list(self.locals.keys()),
                                            list(self.globals.keys()),
                                            list(self.resolver_keys),
                                            self.target))

def __getitem__(self, key):
return self.resolve(key, globally=False)
Expand Down Expand Up @@ -417,6 +422,7 @@ def __init__(self, env, engine, parser, preparser=_preparse):
self.engine = engine
self.parser = parser
self.preparser = preparser
self.assigner = None

def visit(self, node, **kwargs):
if isinstance(node, string_types):
Expand Down Expand Up @@ -575,9 +581,33 @@ def visit_Slice(self, node, **kwargs):
return slice(lower, upper, step)

def visit_Assign(self, node, **kwargs):
cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0],
comparators=[node.value])
return self.visit(cmpr)
"""
support a single assignment node, like

c = a + b

set the assigner at the top level, must be a Name node which
might or might not exist in the resolvers

"""

if len(node.targets) != 1:
raise SyntaxError('can only assign a single expression')
if not isinstance(node.targets[0], ast.Name):
raise SyntaxError('left hand side of an assignment must be a single name')
if self.env.target is None:
raise ValueError('cannot assign without a target object')

try:
assigner = self.visit(node.targets[0], **kwargs)
except (UndefinedVariableError):
assigner = node.targets[0].id

self.assigner = getattr(assigner,'name',assigner)
if self.assigner is None:
raise SyntaxError('left hand side of an assignment must be a single resolvable name')

return self.visit(node.value, **kwargs)

def visit_Attribute(self, node, **kwargs):
attr = node.attr
Expand Down Expand Up @@ -669,7 +699,7 @@ def visitor(x, y):
return reduce(visitor, operands)


_python_not_supported = frozenset(['Assign', 'Dict', 'Call', 'BoolOp',
_python_not_supported = frozenset(['Dict', 'Call', 'BoolOp',
'In', 'NotIn'])
_numexpr_supported_calls = frozenset(_reductions + _mathops)

Expand Down Expand Up @@ -712,6 +742,10 @@ def __init__(self, expr, engine='numexpr', parser='pandas', env=None,
self.terms = self.parse()
self.truediv = truediv

@property
def assigner(self):
    """Assignment target recorded by the visitor while parsing.

    Returns ``None`` when the visitor does not define an ``assigner``
    attribute.
    """
    visitor = self._visitor
    return getattr(visitor, 'assigner', None)

def __call__(self):
self.env.locals['truediv'] = self.truediv
return self.terms(self.env)
Expand Down
5 changes: 5 additions & 0 deletions pandas/computation/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,11 @@ def visit_USub(self, node, **kwargs):
def visit_Index(self, node, **kwargs):
    """Visit the node wrapped by an index and return the result's ``value``."""
    visited = self.visit(node.value)
    return visited.value

def visit_Assign(self, node, **kwargs):
    """Treat ``lhs = rhs`` as the equality comparison ``lhs == rhs``.

    An ``ast.Compare`` node is built from the first assignment target and
    the assigned value, and that node is visited in place of the
    assignment. Any targets after the first are not used.
    """
    lhs = node.targets[0]
    rhs = node.value
    comparison = ast.Compare(left=lhs, ops=[ast.Eq()], comparators=[rhs])
    return self.visit(comparison)

def visit_Subscript(self, node, **kwargs):
value = self.visit(node.value)
slobj = self.visit(node.slice)
Expand Down
59 changes: 58 additions & 1 deletion pandas/computation/tests/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict,
_special_case_arith_ops_syms,
_arith_ops_syms, _bool_ops_syms)
from pandas.computation.common import NameResolutionError
import pandas.computation.expr as expr
import pandas.util.testing as tm
from pandas.util.testing import (assert_frame_equal, randbool,
Expand Down Expand Up @@ -1151,9 +1152,65 @@ def test_assignment_fails(self):
df = DataFrame(np.random.randn(5, 3), columns=list('abc'))
df2 = DataFrame(np.random.randn(5, 3))
expr1 = 'df = df2'
self.assertRaises(NotImplementedError, self.eval, expr1,
self.assertRaises(ValueError, self.eval, expr1,
local_dict={'df': df, 'df2': df2})

def test_assignment_column(self):
skip_if_no_ne('numexpr')
df = DataFrame(np.random.randn(5, 2), columns=list('ab'))
orig_df = df.copy()

# multiple assignees
self.assertRaises(SyntaxError, df.eval, 'd c = a + b')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be switched with test right below? This is not valid Python syntax while the one below is valid Python syntax, but not valid eval syntax.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll check but iirc it fails in the assign block with a left hand side that is a list with a length of 2

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

strange ... that should simply throw the usual syntax error and not do any parsing outside of Python

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you are right 'd c = a + b' raises a SyntaxError in fix_missing_locations....I think I meant the a = b = c, which ends up having multiple assignment nodes (which I raise a Syntax Error); though in theory in the future you could handle

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can u change the test to make sure the multiple assignment fails? again not meaning to be a troll....just want to make sure the correct error is checked


# invalid assignees
self.assertRaises(SyntaxError, df.eval, 'd,c = a + b')
self.assertRaises(SyntaxError, df.eval, 'Timestamp("20131001") = a + b')

# single assignment - existing variable
expected = orig_df.copy()
expected['a'] = expected['a'] + expected['b']
df = orig_df.copy()
df.eval('a = a + b')
assert_frame_equal(df,expected)

# single assignment - new variable
expected = orig_df.copy()
expected['c'] = expected['a'] + expected['b']
df = orig_df.copy()
df.eval('c = a + b')
assert_frame_equal(df,expected)

# with a local name overlap
def f():
df = orig_df.copy()
a = 1
df.eval('a = 1 + b')
return df

df = f()
expected = orig_df.copy()
expected['a'] = 1 + expected['b']
assert_frame_equal(df,expected)

df = orig_df.copy()
def f():
a = 1
df.eval('a=a+b')
self.assertRaises(NameResolutionError, f)

# multiple assignment
df = orig_df.copy()
df.eval('c = a + b')
self.assertRaises(SyntaxError, df.eval, 'c = a = b')
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cpcloud did you mean something additional besides this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh nope! sorry didn't see that 👍


# explicit targets
df = orig_df.copy()
self.eval('c = df.a + df.b', local_dict={'df' : df}, target=df)
expected = orig_df.copy()
expected['c'] = expected['a'] + expected['b']
assert_frame_equal(df,expected)

def test_basic_period_index_boolean_expression(self):
df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i')

Expand Down
2 changes: 2 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1797,12 +1797,14 @@ def eval(self, expr, **kwargs):
>>> from pandas import DataFrame
>>> df = DataFrame(randn(10, 2), columns=list('ab'))
>>> df.eval('a + b')
>>> df.eval('c=a + b')
"""
resolvers = kwargs.pop('resolvers', None)
if resolvers is None:
index_resolvers = self._get_resolvers()
resolvers = [self, index_resolvers]
kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs)
kwargs['target'] = self
return _eval(expr, **kwargs)

def _slice(self, slobj, axis=0, raise_on_error=False, typ=None):
Expand Down