Skip to content

Commit a9dfc50

Browse files
committed
ENH: evaluate datetime ops in python space with eval
1 parent 03ac0bf commit a9dfc50

File tree

6 files changed

+149
-23
lines changed

6 files changed

+149
-23
lines changed

pandas/computation/align.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,15 @@ def _align_core(terms):
112112
axes = biggest.axes
113113
naxes = len(axes)
114114

115-
for term in (terms[i] for i in term_index):
116-
for axis, items in enumerate(term.value.axes):
117-
if isinstance(term.value, pd.Series) and naxes > 1:
118-
ax, itm = naxes - 1, term.value.index
115+
for value in (terms[i].value for i in term_index):
116+
for axis, items in enumerate(value.axes):
117+
if isinstance(value, pd.Series) and naxes > 1:
118+
ax, itm = naxes - 1, value.index
119119
else:
120120
ax, itm = axis, items
121-
axes[ax] = axes[ax].join(itm, how='outer')
121+
# TODO: use is_ method when jtratner's PR is merged
122+
if axes[ax] is not itm:
123+
axes[ax] = axes[ax].join(itm, how='outer')
122124

123125
for i, ndim in compat.iteritems(ndims):
124126
for axis, items in zip(range(ndim), axes):
@@ -136,7 +138,7 @@ def _align_core(terms):
136138
warnings.warn("Alignment difference on axis {0} is larger"
137139
" than an order of magnitude on term {1!r}, "
138140
"by more than {2:.4g}; performance may suffer"
139-
"".format(axis, term.name, ordm),
141+
"".format(axis, terms[i].name, ordm),
140142
category=pd.io.common.PerformanceWarning)
141143

142144
if transpose:

pandas/computation/expr.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,8 @@ def f(cls):
379379
return f
380380

381381

382+
_date_kinds = frozenset(['datetime64', 'timestamp', 'datetime'])
383+
382384
@disallow(_unsupported_nodes)
383385
@add_ops(_op_classes)
384386
class BaseExprVisitor(ast.NodeVisitor):
@@ -493,8 +495,15 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,
493495
maybe_eval_in_python=('==', '!=')):
494496
res = op(lhs, rhs)
495497

496-
# "in"/"not in" ops are always evaluated in python
498+
if (res.op in _cmp_ops_syms and
499+
lhs.kind in _date_kinds or rhs.kind in _date_kinds and
500+
self.engine != 'pytables'):
501+
# all date ops must be done in python because numexpr doesn't work well
502+
# with NaT
503+
return self._possibly_eval(res, self.binary_ops)
504+
497505
if res.op in eval_in_python:
506+
# "in"/"not in" ops are always evaluated in python
498507
return self._possibly_eval(res, eval_in_python)
499508
elif (lhs.return_type == object or rhs.return_type == object and
500509
self.engine != 'pytables'):

pandas/computation/ops.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -162,23 +162,20 @@ def raw(self):
162162

163163
@property
164164
def kind(self):
165+
t = self.type
165166
try:
166-
return self.type.__name__
167+
res = t.__name__
167168
except AttributeError:
168-
return self.type.type.__name__
169+
res = t.type.__name__
170+
return res.lower()
169171

170172
@property
171173
def value(self):
172-
kind = self.kind.lower()
173-
if kind == 'datetime64':
174-
try:
175-
return self._value.asi8
176-
except AttributeError:
177-
return self._value.view('i8')
174+
kind = self.kind
175+
if kind == 'timestamp':
176+
return self._value.asm8
178177
elif kind == 'datetime':
179-
return pd.Timestamp(self._value)
180-
elif kind == 'timestamp':
181-
return self._value.asm8.view('i8')
178+
return np.datetime64(self._value)
182179
return self._value
183180

184181
@value.setter
@@ -248,6 +245,16 @@ def return_type(self):
248245
def isscalar(self):
249246
return all(operand.isscalar for operand in self.operands)
250247

248+
@property
249+
def kind(self):
250+
t = self.return_type
251+
252+
try:
253+
res = t.__name__
254+
except AttributeError:
255+
res = t.type.__name__
256+
return res.lower()
257+
251258

252259
def _in(x, y):
253260
"""Compute the vectorized membership of ``x in y`` if possible, otherwise

pandas/computation/tests/test_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1003,7 +1003,7 @@ def check_performance_warning_for_poor_alignment(self, engine, parser):
10031003
expected = ("Alignment difference on axis {0} is larger"
10041004
" than an order of magnitude on term {1!r}, "
10051005
"by more than {2:.4g}; performance may suffer"
1006-
"".format(1, 's', np.log10(s.size - df.shape[1])))
1006+
"".format(1, 'df', np.log10(s.size - df.shape[1])))
10071007
assert_equal(msg, expected)
10081008

10091009
def test_performance_warning_for_poor_alignment(self):

pandas/core/frame.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1898,6 +1898,7 @@ def _get_index_resolvers(self, axis):
18981898
# index or columns
18991899
axis_index = getattr(self, axis)
19001900
d = dict()
1901+
prefix = axis[0]
19011902

19021903
for i, name in enumerate(axis_index.names):
19031904
if name is not None:
@@ -1906,15 +1907,19 @@ def _get_index_resolvers(self, axis):
19061907
# prefix with 'i' or 'c' depending on the input axis
19071908
# e.g., you must do ilevel_0 for the 0th level of an unnamed
19081909
# multiiindex
1909-
level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i)
1910-
key = level_string
1910+
key = '{prefix}level_{i}'.format(prefix=prefix, i=i)
19111911
level = i
19121912

19131913
d[key] = Series(axis_index.get_level_values(level).values,
1914-
index=axis_index, name=level)
1914+
index=axis_index, name=name)
19151915

19161916
# put the index/columns itself in the dict
1917-
d[axis] = axis_index
1917+
if isinstance(axis_index, MultiIndex):
1918+
dindex = axis_index
1919+
else:
1920+
dindex = axis_index.to_series()
1921+
1922+
d[axis] = dindex
19181923
return d
19191924

19201925
def query(self, expr, **kwargs):

pandas/tests/test_frame.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11446,6 +11446,58 @@ def test_date_query_method(self):
1144611446
expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
1144711447
assert_frame_equal(res, expec)
1144811448

11449+
def test_date_query_with_NaT(self):
11450+
engine, parser = self.engine, self.parser
11451+
n = 10
11452+
df = DataFrame(randn(n, 3))
11453+
df['dates1'] = date_range('1/1/2012', periods=n)
11454+
df['dates2'] = date_range('1/1/2013', periods=n)
11455+
df['dates3'] = date_range('1/1/2014', periods=n)
11456+
df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
11457+
df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
11458+
res = df.query('dates1 < 20130101 < dates3', engine=engine,
11459+
parser=parser)
11460+
expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
11461+
assert_frame_equal(res, expec)
11462+
11463+
def test_date_index_query(self):
11464+
engine, parser = self.engine, self.parser
11465+
n = 10
11466+
df = DataFrame(randn(n, 3))
11467+
df['dates1'] = date_range('1/1/2012', periods=n)
11468+
df['dates3'] = date_range('1/1/2014', periods=n)
11469+
df.set_index('dates1', inplace=True, drop=True)
11470+
res = df.query('index < 20130101 < dates3', engine=engine,
11471+
parser=parser)
11472+
expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
11473+
assert_frame_equal(res, expec)
11474+
11475+
def test_date_index_query_with_NaT(self):
11476+
engine, parser = self.engine, self.parser
11477+
n = 10
11478+
df = DataFrame(randn(n, 3))
11479+
df['dates1'] = date_range('1/1/2012', periods=n)
11480+
df['dates3'] = date_range('1/1/2014', periods=n)
11481+
df.iloc[0, 0] = pd.NaT
11482+
df.set_index('dates1', inplace=True, drop=True)
11483+
res = df.query('index < 20130101 < dates3', engine=engine,
11484+
parser=parser)
11485+
expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
11486+
assert_frame_equal(res, expec)
11487+
11488+
def test_date_index_query_with_NaT_duplicates(self):
11489+
engine, parser = self.engine, self.parser
11490+
n = 10
11491+
d = {}
11492+
d['dates1'] = date_range('1/1/2012', periods=n)
11493+
d['dates3'] = date_range('1/1/2014', periods=n)
11494+
df = DataFrame(d)
11495+
df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
11496+
df.set_index('dates1', inplace=True, drop=True)
11497+
res = df.query('index < 20130101 < dates3', engine=engine, parser=parser)
11498+
expec = df[(df.index.to_series() < '20130101') & ('20130101' < df.dates3)]
11499+
assert_frame_equal(res, expec)
11500+
1144911501
def test_query_scope(self):
1145011502
engine, parser = self.engine, self.parser
1145111503
from pandas.computation.common import NameResolutionError
@@ -11608,6 +11660,57 @@ def test_date_query_method(self):
1160811660
expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
1160911661
assert_frame_equal(res, expec)
1161011662

11663+
def test_date_query_with_NaT(self):
11664+
engine, parser = self.engine, self.parser
11665+
n = 10
11666+
df = DataFrame(randn(n, 3))
11667+
df['dates1'] = date_range('1/1/2012', periods=n)
11668+
df['dates2'] = date_range('1/1/2013', periods=n)
11669+
df['dates3'] = date_range('1/1/2014', periods=n)
11670+
df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
11671+
df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
11672+
res = df.query('(dates1 < 20130101) & (20130101 < dates3)',
11673+
engine=engine, parser=parser)
11674+
expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
11675+
assert_frame_equal(res, expec)
11676+
11677+
def test_date_index_query(self):
11678+
engine, parser = self.engine, self.parser
11679+
n = 10
11680+
df = DataFrame(randn(n, 3))
11681+
df['dates1'] = date_range('1/1/2012', periods=n)
11682+
df['dates3'] = date_range('1/1/2014', periods=n)
11683+
df.set_index('dates1', inplace=True, drop=True)
11684+
res = df.query('(index < 20130101) & (20130101 < dates3)',
11685+
engine=engine, parser=parser)
11686+
expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
11687+
assert_frame_equal(res, expec)
11688+
11689+
def test_date_index_query_with_NaT(self):
11690+
engine, parser = self.engine, self.parser
11691+
n = 10
11692+
df = DataFrame(randn(n, 3))
11693+
df['dates1'] = date_range('1/1/2012', periods=n)
11694+
df['dates3'] = date_range('1/1/2014', periods=n)
11695+
df.iloc[0, 0] = pd.NaT
11696+
df.set_index('dates1', inplace=True, drop=True)
11697+
res = df.query('(index < 20130101) & (20130101 < dates3)',
11698+
engine=engine, parser=parser)
11699+
expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
11700+
assert_frame_equal(res, expec)
11701+
11702+
def test_date_index_query_with_NaT_duplicates(self):
11703+
engine, parser = self.engine, self.parser
11704+
n = 10
11705+
df = DataFrame(randn(n, 3))
11706+
df['dates1'] = date_range('1/1/2012', periods=n)
11707+
df['dates3'] = date_range('1/1/2014', periods=n)
11708+
df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
11709+
df.set_index('dates1', inplace=True, drop=True)
11710+
with tm.assertRaises(NotImplementedError):
11711+
res = df.query('index < 20130101 < dates3', engine=engine,
11712+
parser=parser)
11713+
1161111714
def test_nested_scope(self):
1161211715
engine = self.engine
1161311716
parser = self.parser

0 commit comments

Comments
 (0)