From 44c37b97d0b1fa34c8570daa69e14335963d4317 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 26 Oct 2013 18:20:14 -0400 Subject: [PATCH 1/2] ENH: allow in-line expression assignment with df.eval TST: tests for local name overlaps ENH: moved assign to visit_Assign from visit_Module --- doc/source/release.rst | 3 +- pandas/computation/eval.py | 6 ++++ pandas/computation/expr.py | 49 ++++++++++++++++++++++++--- pandas/computation/pytables.py | 5 +++ pandas/computation/tests/test_eval.py | 40 ++++++++++++++++++++++ 5 files changed, 98 insertions(+), 5 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index b74b23029a2ac..cfb47873863b8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -72,7 +72,8 @@ Experimental Features ``numexpr`` behind the scenes. This results in large speedups for complicated expressions involving large DataFrames/Series. - :class:`~pandas.DataFrame` has a new :meth:`~pandas.DataFrame.eval` that - evaluates an expression in the context of the ``DataFrame``. + evaluates an expression in the context of the ``DataFrame``; allows + inline expression assignment - A :meth:`~pandas.DataFrame.query` method has been added that allows you to select elements of a ``DataFrame`` using a natural query syntax nearly identical to Python syntax. 
diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 36b1e2bc96090..c5971bab9a792 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -203,4 +203,10 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, eng = _engines[engine] eng_inst = eng(parsed_expr) ret = eng_inst.evaluate() + + # assign if needed + if parsed_expr.assignee is not None and parsed_expr.assigner is not None: + parsed_expr.assignee[parsed_expr.assigner] = ret + return None + return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index ba2dffa9e71b8..8706d4ae5cfef 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -21,6 +21,7 @@ _arith_ops_syms, _unary_ops_syms, is_term) from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div +from pandas.computation.ops import UndefinedVariableError def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, @@ -417,6 +418,8 @@ def __init__(self, env, engine, parser, preparser=_preparse): self.engine = engine self.parser = parser self.preparser = preparser + self.assignee = None + self.assigner = None def visit(self, node, **kwargs): if isinstance(node, string_types): @@ -575,9 +578,39 @@ def visit_Slice(self, node, **kwargs): return slice(lower, upper, step) def visit_Assign(self, node, **kwargs): - cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0], - comparators=[node.value]) - return self.visit(cmpr) + """ + support a single assignment node, like + + c = a + b + + set the assignee at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + + if len(node.targets) != 1: + raise SyntaxError('can only assign a single expression') + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError('left hand side of an assignment must be a single name') + + # we have no one to assign to + if not 
len(self.env.resolvers): + raise NotImplementedError + + try: + assigner = self.visit(node.targets[0], **kwargs) + except (UndefinedVariableError): + assigner = node.targets[0].id + + self.assigner = getattr(assigner,'name',assigner) + if self.assigner is None: + raise SyntaxError('left hand side of an assignment must be a single resolvable name') + try: + self.assignee = self.env.resolvers[0] + except: + raise ValueError('cannot create an assignee for this expression') + + return self.visit(node.value, **kwargs) def visit_Attribute(self, node, **kwargs): attr = node.attr @@ -669,7 +702,7 @@ def visitor(x, y): return reduce(visitor, operands) -_python_not_supported = frozenset(['Assign', 'Dict', 'Call', 'BoolOp', +_python_not_supported = frozenset(['Dict', 'Call', 'BoolOp', 'In', 'NotIn']) _numexpr_supported_calls = frozenset(_reductions + _mathops) @@ -712,6 +745,14 @@ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, self.terms = self.parse() self.truediv = truediv + @property + def assigner(self): + return getattr(self._visitor,'assigner',None) + + @property + def assignee(self): + return getattr(self._visitor,'assignee',None) + def __call__(self): self.env.locals['truediv'] = self.truediv return self.terms(self.env) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 9ffae5edd93bc..eb675d6230c8c 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -389,6 +389,11 @@ def visit_USub(self, node, **kwargs): def visit_Index(self, node, **kwargs): return self.visit(node.value).value + def visit_Assign(self, node, **kwargs): + cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0], + comparators=[node.value]) + return self.visit(cmpr) + def visit_Subscript(self, node, **kwargs): value = self.visit(node.value) slobj = self.visit(node.slice) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index aa5c0cc5d50f6..004f858d671f1 100644 --- 
a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -24,6 +24,7 @@ from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict, _special_case_arith_ops_syms, _arith_ops_syms, _bool_ops_syms) +from pandas.computation.common import NameResolutionError import pandas.computation.expr as expr import pandas.util.testing as tm from pandas.util.testing import (assert_frame_equal, randbool, @@ -1154,6 +1155,45 @@ def test_assignment_fails(self): self.assertRaises(NotImplementedError, self.eval, expr1, local_dict={'df': df, 'df2': df2}) + def test_assignment_column(self): + df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + orig_df = df.copy() + + # multiple assignees + self.assertRaises(SyntaxError, df.eval, 'd c = a + b') + + # invalid assignees + self.assertRaises(SyntaxError, df.eval, 'd,c = a + b') + self.assertRaises(SyntaxError, df.eval, 'Timestamp("20131001") = a + b') + + # single assignment - existing variable + expected = orig_df.copy() + expected['a'] = expected['a'] + expected['b'] + df = orig_df.copy() + df.eval('a = a + b') + assert_frame_equal(df,expected) + + # single assignment - new variable + expected = orig_df.copy() + expected['c'] = expected['a'] + expected['b'] + df = orig_df.copy() + df.eval('c = a + b') + assert_frame_equal(df,expected) + + # with a local name overlap + a = 1 + df = orig_df.copy() + df.eval('a = 1 + b') + expected = orig_df.copy() + expected['a'] = 1 + expected['b'] + assert_frame_equal(df,expected) + + df = orig_df.copy() + def f(): + a = 1 + df.eval('a=a+b') + self.assertRaises(NameResolutionError, f) + def test_basic_period_index_boolean_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') From 640d4c9fd7989f264faee010133890b3d21b9828 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 26 Oct 2013 20:05:51 -0400 Subject: [PATCH 2/2] DOC: update docs in enhancingperf.rst TST: additional tests for multiple assignment, targets ENH: add target to Scope, use 
instead of resolvers --- doc/source/enhancingperf.rst | 13 ++++++++-- pandas/computation/eval.py | 11 ++++++--- pandas/computation/expr.py | 35 +++++++++++---------------- pandas/computation/tests/test_eval.py | 25 ++++++++++++++++--- pandas/core/frame.py | 2 ++ 5 files changed, 55 insertions(+), 31 deletions(-) diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index e59cb6ac30964..4e9e62a2f0e3e 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -441,18 +441,27 @@ The ``DataFrame.eval`` method (Experimental) In addition to the top level :func:`~pandas.eval` function you can also evaluate an expression in the "context" of a ``DataFrame``. - .. ipython:: python df = DataFrame(randn(5, 2), columns=['a', 'b']) df.eval('a + b') - Any expression that is a valid :func:`~pandas.eval` expression is also a valid ``DataFrame.eval`` expression, with the added benefit that *you don't have to prefix the name of the* ``DataFrame`` *to the column you're interested in evaluating*. +In addition, you can perform in-line assignment of columns within an expression. +This can allow for *formulaic evaluation*. Only a single assignment is permitted. +It can be a new column name or an existing column name. It must be a string-like. + +.. ipython:: python + + df = DataFrame(dict(a = range(5), b = range(5,10))) + df.eval('c=a+b') + df.eval('d=a+b+c') + df.eval('a=1') + df Local Variables ~~~~~~~~~~~~~~~ diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index c5971bab9a792..163477b258e15 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -113,7 +113,8 @@ def _convert_expression(expr): def eval(expr, parser='pandas', engine='numexpr', truediv=True, - local_dict=None, global_dict=None, resolvers=None, level=2): + local_dict=None, global_dict=None, resolvers=None, level=2, + target=None): """Evaluate a Python expression as a string using various backends. 
The following arithmetic operations are supported: ``+``, ``-``, ``*``, @@ -169,6 +170,8 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, level : int, optional The number of prior stack frames to traverse and add to the current scope. Most users will **not** need to change this parameter. + target : a target object for assignment, optional, default is None + essentially this is a passed in resolver Returns ------- @@ -194,7 +197,7 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, # get our (possibly passed-in) scope env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers, level=level) + resolvers=resolvers, level=level, target=target) parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) @@ -205,8 +208,8 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, ret = eng_inst.evaluate() # assign if needed - if parsed_expr.assignee is not None and parsed_expr.assigner is not None: - parsed_expr.assignee[parsed_expr.assigner] = ret + if env.target is not None and parsed_expr.assigner is not None: + env.target[parsed_expr.assigner] = ret return None return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 8706d4ae5cfef..64bceee118fd1 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -25,10 +25,10 @@ def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, - **kwargs): + target=None, **kwargs): """Ensure that we are grabbing the correct scope.""" return Scope(gbls=global_dict, lcls=local_dict, level=level, - resolvers=resolvers) + resolvers=resolvers, target=target) def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): @@ -89,13 +89,14 @@ class Scope(StringMixin): resolver_keys : frozenset """ __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', - 'resolver_keys', '_resolver', 'level', 'ntemps') + 'resolver_keys', '_resolver', 'level', 'ntemps', 'target') 
- def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): + def __init__(self, gbls=None, lcls=None, level=1, resolvers=None, target=None): self.level = level self.resolvers = tuple(resolvers or []) self.globals = dict() self.locals = dict() + self.target = target self.ntemps = 1 # number of temporary variables in this scope if isinstance(lcls, Scope): @@ -103,6 +104,8 @@ def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): self.locals.update(ld.locals.copy()) self.globals.update(ld.globals.copy()) self.resolvers += ld.resolvers + if ld.target is not None: + self.target = ld.target self.update(ld.level) frame = sys._getframe(level) @@ -131,9 +134,10 @@ def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): def __unicode__(self): return com.pprint_thing("locals: {0}\nglobals: {0}\nresolvers: " - "{0}".format(list(self.locals.keys()), - list(self.globals.keys()), - list(self.resolver_keys))) + "{0}\ntarget: {0}".format(list(self.locals.keys()), + list(self.globals.keys()), + list(self.resolver_keys), + self.target)) def __getitem__(self, key): return self.resolve(key, globally=False) @@ -418,7 +422,6 @@ def __init__(self, env, engine, parser, preparser=_preparse): self.engine = engine self.parser = parser self.preparser = preparser - self.assignee = None self.assigner = None def visit(self, node, **kwargs): @@ -583,7 +586,7 @@ def visit_Assign(self, node, **kwargs): c = a + b - set the assignee at the top level, must be a Name node which + set the assigner at the top level, must be a Name node which might or might not exist in the resolvers """ @@ -592,10 +595,8 @@ def visit_Assign(self, node, **kwargs): raise SyntaxError('can only assign a single expression') if not isinstance(node.targets[0], ast.Name): raise SyntaxError('left hand side of an assignment must be a single name') - - # we have no one to assign to - if not len(self.env.resolvers): - raise NotImplementedError + if self.env.target is None: + raise ValueError('cannot 
assign without a target object') try: assigner = self.visit(node.targets[0], **kwargs) @@ -605,10 +606,6 @@ def visit_Assign(self, node, **kwargs): self.assigner = getattr(assigner,'name',assigner) if self.assigner is None: raise SyntaxError('left hand side of an assignment must be a single resolvable name') - try: - self.assignee = self.env.resolvers[0] - except: - raise ValueError('cannot create an assignee for this expression') return self.visit(node.value, **kwargs) @@ -749,10 +746,6 @@ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, def assigner(self): return getattr(self._visitor,'assigner',None) - @property - def assignee(self): - return getattr(self._visitor,'assignee',None) - def __call__(self): self.env.locals['truediv'] = self.truediv return self.terms(self.env) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 004f858d671f1..b8de54ade31db 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1152,10 +1152,11 @@ def test_assignment_fails(self): df = DataFrame(np.random.randn(5, 3), columns=list('abc')) df2 = DataFrame(np.random.randn(5, 3)) expr1 = 'df = df2' - self.assertRaises(NotImplementedError, self.eval, expr1, + self.assertRaises(ValueError, self.eval, expr1, local_dict={'df': df, 'df2': df2}) def test_assignment_column(self): + skip_if_no_ne('numexpr') df = DataFrame(np.random.randn(5, 2), columns=list('ab')) orig_df = df.copy() @@ -1181,9 +1182,13 @@ def test_assignment_column(self): assert_frame_equal(df,expected) # with a local name overlap - a = 1 - df = orig_df.copy() - df.eval('a = 1 + b') + def f(): + df = orig_df.copy() + a = 1 + df.eval('a = 1 + b') + return df + + df = f() expected = orig_df.copy() expected['a'] = 1 + expected['b'] assert_frame_equal(df,expected) @@ -1194,6 +1199,18 @@ def f(): df.eval('a=a+b') self.assertRaises(NameResolutionError, f) + # multiple assignment + df = orig_df.copy() + df.eval('c = a + b') + 
self.assertRaises(SyntaxError, df.eval, 'c = a = b') + + # explicit targets + df = orig_df.copy() + self.eval('c = df.a + df.b', local_dict={'df' : df}, target=df) + expected = orig_df.copy() + expected['c'] = expected['a'] + expected['b'] + assert_frame_equal(df,expected) + def test_basic_period_index_boolean_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b485d51514162..a91180ac43561 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1797,12 +1797,14 @@ def eval(self, expr, **kwargs): >>> from pandas import DataFrame >>> df = DataFrame(randn(10, 2), columns=list('ab')) >>> df.eval('a + b') + >>> df.eval('c=a + b') """ resolvers = kwargs.pop('resolvers', None) if resolvers is None: index_resolvers = self._get_resolvers() resolvers = [self, index_resolvers] kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) + kwargs['target'] = self return _eval(expr, **kwargs) def _slice(self, slobj, axis=0, raise_on_error=False, typ=None):