ENH: evaluate datetime ops in python space with eval

cpcloud · cpcloud · commit a9dfc5077c89 · 2013-09-27T13:36:16.000-04:00
diff --git a/pandas/computation/align.py b/pandas/computation/align.py
@@ -112,13 +112,15 @@ def _align_core(terms):
     axes = biggest.axes
     naxes = len(axes)
 
-    for term in (terms[i] for i in term_index):
-        for axis, items in enumerate(term.value.axes):
-            if isinstance(term.value, pd.Series) and naxes > 1:
-                ax, itm = naxes - 1, term.value.index
+    for value in (terms[i].value for i in term_index):
+        for axis, items in enumerate(value.axes):
+            if isinstance(value, pd.Series) and naxes > 1:
+                ax, itm = naxes - 1, value.index
             else:
                 ax, itm = axis, items
-            axes[ax] = axes[ax].join(itm, how='outer')
+            # TODO: use is_ method when jtratner's PR is merged
+            if axes[ax] is not itm:
+                axes[ax] = axes[ax].join(itm, how='outer')
 
     for i, ndim in compat.iteritems(ndims):
         for axis, items in zip(range(ndim), axes):
@@ -136,7 +138,7 @@ def _align_core(terms):
                     warnings.warn("Alignment difference on axis {0} is larger"
                                   " than an order of magnitude on term {1!r}, "
                                   "by more than {2:.4g}; performance may suffer"
-                                  "".format(axis, term.name, ordm),
+                                  "".format(axis, terms[i].name, ordm),
                                   category=pd.io.common.PerformanceWarning)
 
                 if transpose:
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
@@ -379,6 +379,8 @@ def f(cls):
     return f
 
 
+_date_kinds = frozenset(['datetime64', 'timestamp', 'datetime'])
+
 @disallow(_unsupported_nodes)
 @add_ops(_op_classes)
 class BaseExprVisitor(ast.NodeVisitor):
@@ -493,8 +495,15 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,
                                  maybe_eval_in_python=('==', '!=')):
         res = op(lhs, rhs)
 
-        # "in"/"not in" ops are always evaluated in python
+        if (res.op in _cmp_ops_syms and
+                (lhs.kind in _date_kinds or rhs.kind in _date_kinds) and
+                self.engine != 'pytables'):
+            # date comparisons on either side run in python (numexpr does not
+            # work well with NaT); the 'or' is grouped so every clause applies
+            return self._possibly_eval(res, self.binary_ops)
+
         if res.op in eval_in_python:
+            # "in"/"not in" ops are always evaluated in python
             return self._possibly_eval(res, eval_in_python)
         elif (lhs.return_type == object or rhs.return_type == object and
               self.engine != 'pytables'):
diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py
@@ -162,23 +162,20 @@ def raw(self):
 
     @property
     def kind(self):
+        t = self.type
         try:
-            return self.type.__name__
+            res = t.__name__
         except AttributeError:
-            return self.type.type.__name__
+            res = t.type.__name__
+        return res.lower()
 
     @property
     def value(self):
-        kind = self.kind.lower()
-        if kind == 'datetime64':
-            try:
-                return self._value.asi8
-            except AttributeError:
-                return self._value.view('i8')
+        kind = self.kind
+        if kind == 'timestamp':
+            return self._value.asm8
         elif kind == 'datetime':
-            return pd.Timestamp(self._value)
-        elif kind == 'timestamp':
-            return self._value.asm8.view('i8')
+            return np.datetime64(self._value)
         return self._value
 
     @value.setter
@@ -248,6 +245,16 @@ def return_type(self):
     def isscalar(self):
         return all(operand.isscalar for operand in self.operands)
 
+    @property
+    def kind(self):
+        """Lowercased ``__name__`` of ``return_type`` (or of its ``.type``)."""
+        rtype = self.return_type
+        try:
+            name = rtype.__name__
+        except AttributeError:
+            name = rtype.type.__name__
+        return name.lower()
+
 
 def _in(x, y):
     """Compute the vectorized membership of ``x in y`` if possible, otherwise
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
@@ -1003,7 +1003,7 @@ def check_performance_warning_for_poor_alignment(self, engine, parser):
                 expected = ("Alignment difference on axis {0} is larger"
                             " than an order of magnitude on term {1!r}, "
                             "by more than {2:.4g}; performance may suffer"
-                            "".format(1, 's', np.log10(s.size - df.shape[1])))
+                            "".format(1, 'df', np.log10(s.size - df.shape[1])))
                 assert_equal(msg, expected)
 
     def test_performance_warning_for_poor_alignment(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1898,6 +1898,7 @@ def _get_index_resolvers(self, axis):
         # index or columns
         axis_index = getattr(self, axis)
         d = dict()
+        prefix = axis[0]
 
         for i, name in enumerate(axis_index.names):
             if name is not None:
@@ -1906,15 +1907,19 @@ def _get_index_resolvers(self, axis):
                 # prefix with 'i' or 'c' depending on the input axis
                 # e.g., you must do ilevel_0 for the 0th level of an unnamed
                 # multiiindex
-                level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i)
-                key = level_string
+                key = '{prefix}level_{i}'.format(prefix=prefix, i=i)
                 level = i
 
             d[key] = Series(axis_index.get_level_values(level).values,
-                            index=axis_index, name=level)
+                            index=axis_index, name=name)
 
         # put the index/columns itself in the dict
-        d[axis] = axis_index
+        if isinstance(axis_index, MultiIndex):
+            dindex = axis_index
+        else:
+            dindex = axis_index.to_series()
+
+        d[axis] = dindex
         return d
 
     def query(self, expr, **kwargs):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -11446,6 +11446,58 @@ def test_date_query_method(self):
         expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
         assert_frame_equal(res, expec)
 
+    def test_date_query_with_NaT(self):
+        # chained comparison; NaT is written to random rows of two columns
+        nrows = 10
+        df = DataFrame(randn(nrows, 3))
+        df['dates1'] = date_range('1/1/2012', periods=nrows)
+        df['dates2'] = date_range('1/1/2013', periods=nrows)
+        df['dates3'] = date_range('1/1/2014', periods=nrows)
+        df.loc[np.random.rand(nrows) > 0.5, 'dates1'] = pd.NaT
+        df.loc[np.random.rand(nrows) > 0.5, 'dates3'] = pd.NaT
+        result = df.query('dates1 < 20130101 < dates3', engine=self.engine,
+                          parser=self.parser)
+        expected = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
+    def test_date_index_query(self):
+        # 'index' in the query is compared the same way as df.index below
+        nrows = 10
+        df = DataFrame(randn(nrows, 3))
+        df['dates1'] = date_range('1/1/2012', periods=nrows)
+        df['dates3'] = date_range('1/1/2014', periods=nrows)
+        df.set_index('dates1', inplace=True, drop=True)
+        result = df.query('index < 20130101 < dates3', engine=self.engine,
+                          parser=self.parser)
+        expected = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
+    def test_date_index_query_with_NaT(self):
+        nrows = 10
+        df = DataFrame(randn(nrows, 3))
+        df['dates1'] = date_range('1/1/2012', periods=nrows)
+        df['dates3'] = date_range('1/1/2014', periods=nrows)
+        # NOTE(review): iloc[0, 0] hits the first randn column, not 'dates1'
+        df.iloc[0, 0] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        result = df.query('index < 20130101 < dates3', engine=self.engine,
+                          parser=self.parser)
+        expected = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
+    def test_date_index_query_with_NaT_duplicates(self):
+        # NaT goes to random rows of 'dates1' before it becomes the index
+        n = 10
+        df = DataFrame({'dates1': date_range('1/1/2012', periods=n),
+                        'dates3': date_range('1/1/2014', periods=n)})
+        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        result = df.query('index < 20130101 < dates3', engine=self.engine,
+                          parser=self.parser)
+        index_before = df.index.to_series() < '20130101'
+        expected = df[index_before & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
     def test_query_scope(self):
         engine, parser = self.engine, self.parser
         from pandas.computation.common import NameResolutionError
@@ -11608,6 +11660,57 @@ def test_date_query_method(self):
         expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
         assert_frame_equal(res, expec)
 
+    def test_date_query_with_NaT(self):
+        # explicit '&' of two comparisons; NaT at random rows of two columns
+        nrows = 10
+        df = DataFrame(randn(nrows, 3))
+        df['dates1'] = date_range('1/1/2012', periods=nrows)
+        df['dates2'] = date_range('1/1/2013', periods=nrows)
+        df['dates3'] = date_range('1/1/2014', periods=nrows)
+        df.loc[np.random.rand(nrows) > 0.5, 'dates1'] = pd.NaT
+        df.loc[np.random.rand(nrows) > 0.5, 'dates3'] = pd.NaT
+        result = df.query('(dates1 < 20130101) & (20130101 < dates3)',
+                          engine=self.engine, parser=self.parser)
+        expected = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
+    def test_date_index_query(self):
+        # explicit '&' form; 'index' is compared the same way as df.index
+        nrows = 10
+        df = DataFrame(randn(nrows, 3))
+        df['dates1'] = date_range('1/1/2012', periods=nrows)
+        df['dates3'] = date_range('1/1/2014', periods=nrows)
+        df.set_index('dates1', inplace=True, drop=True)
+        result = df.query('(index < 20130101) & (20130101 < dates3)',
+                          engine=self.engine, parser=self.parser)
+        expected = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
+    def test_date_index_query_with_NaT(self):
+        nrows = 10
+        df = DataFrame(randn(nrows, 3))
+        df['dates1'] = date_range('1/1/2012', periods=nrows)
+        df['dates3'] = date_range('1/1/2014', periods=nrows)
+        # NOTE(review): iloc[0, 0] hits the first randn column, not 'dates1'
+        df.iloc[0, 0] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        result = df.query('(index < 20130101) & (20130101 < dates3)',
+                          engine=self.engine, parser=self.parser)
+        expected = df[(df.index < '20130101') & ('20130101' < df.dates3)]
+        assert_frame_equal(result, expected)
+
+    def test_date_index_query_with_NaT_duplicates(self):
+        n = 10
+        df = DataFrame(randn(n, 3))
+        df['dates1'] = date_range('1/1/2012', periods=n)
+        df['dates3'] = date_range('1/1/2014', periods=n)
+        # fixed mask: a random one can yield < 2 NaT, i.e. no duplicate labels
+        df.loc[np.arange(n) % 2 == 0, 'dates1'] = pd.NaT
+        df.set_index('dates1', inplace=True, drop=True)
+        with tm.assertRaises(NotImplementedError):
+            df.query('index < 20130101 < dates3', engine=self.engine,
+                     parser=self.parser)
+
     def test_nested_scope(self):
         engine = self.engine
         parser = self.parser