Skip to content

Commit fb124fd

Browse files
committed
Merge pull request #8695 from jreback/danger
API/BUG: return np.nan rather than -1 for invalid datetime accessors values (GH8689)
2 parents 9a66dbb + b12af6e commit fb124fd

File tree

9 files changed

+136
-33
lines changed

9 files changed

+136
-33
lines changed

doc/source/whatsnew/v0.15.1.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,33 @@ API changes
121121

122122
pd.concat(deque((df1, df2)))
123123

124+
- ``s.dt.hour`` and other ``.dt`` accessors will now return ``np.nan`` for missing values (rather than -1, as previously) (:issue:`8689`)
125+
126+
.. ipython:: python
127+
128+
s = Series(date_range('20130101',periods=5,freq='D'))
129+
s.iloc[2] = np.nan
130+
s
131+
132+
previous behavior:
133+
134+
.. code-block:: python
135+
136+
In [6]: s.dt.hour
137+
Out[6]:
138+
0 0
139+
1 0
140+
2 -1
141+
3 0
142+
4 0
143+
dtype: int64
144+
145+
current behavior:
146+
147+
.. ipython:: python
148+
149+
s.dt.hour
150+
124151
.. _whatsnew_0151.enhancements:
125152

126153
Enhancements

pandas/tests/test_series.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,29 @@ def f():
208208
s.dt.hour[0] = 5
209209
self.assertRaises(com.SettingWithCopyError, f)
210210

211+
def test_valid_dt_with_missing_values(self):
212+
213+
from datetime import date, time
214+
215+
# GH 8689
216+
s = Series(date_range('20130101',periods=5,freq='D'))
217+
s_orig = s.copy()
218+
s.iloc[2] = pd.NaT
219+
220+
for attr in ['microsecond','nanosecond','second','minute','hour','day']:
221+
expected = getattr(s.dt,attr).copy()
222+
expected.iloc[2] = np.nan
223+
result = getattr(s.dt,attr)
224+
tm.assert_series_equal(result, expected)
225+
226+
result = s.dt.date
227+
expected = Series([date(2013,1,1),date(2013,1,2),np.nan,date(2013,1,4),date(2013,1,5)],dtype='object')
228+
tm.assert_series_equal(result, expected)
229+
230+
result = s.dt.time
231+
expected = Series([time(0),time(0),np.nan,time(0),time(0)],dtype='object')
232+
tm.assert_series_equal(result, expected)
233+
211234
def test_binop_maybe_preserve_name(self):
212235

213236
# names match, preserve

pandas/tseries/base.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,31 @@ def asobject(self):
174174
from pandas.core.index import Index
175175
return Index(self._box_values(self.asi8), name=self.name, dtype=object)
176176

177+
def _maybe_mask_results(self, result, fill_value=None, convert=None):
178+
"""
179+
Parameters
180+
----------
181+
result : a ndarray
182+
convert : string/dtype or None
183+
184+
Returns
185+
-------
186+
result : ndarray with values replaced by the fill_value
187+
188+
mask the result if needed, convert to the provided dtype if it is not None
189+
190+
This is an internal routine
191+
"""
192+
193+
if self.hasnans:
194+
mask = self.asi8 == tslib.iNaT
195+
if convert:
196+
result = result.astype(convert)
197+
if fill_value is None:
198+
fill_value = np.nan
199+
result[mask] = fill_value
200+
return result
201+
177202
def tolist(self):
178203
"""
179204
return a list of the underlying data

pandas/tseries/index.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,17 @@ def f(self):
4646
utc = _utc()
4747
if self.tz is not utc:
4848
values = self._local_timestamps()
49+
4950
if field in ['is_month_start', 'is_month_end',
5051
'is_quarter_start', 'is_quarter_end',
5152
'is_year_start', 'is_year_end']:
5253
month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12
53-
return tslib.get_start_end_field(values, field, self.freqstr, month_kw)
54+
result = tslib.get_start_end_field(values, field, self.freqstr, month_kw)
5455
else:
55-
return tslib.get_date_field(values, field)
56+
result = tslib.get_date_field(values, field)
57+
58+
return self._maybe_mask_results(result,convert='float64')
59+
5660
f.__name__ = name
5761
f.__doc__ = docstring
5862
return property(f)
@@ -643,9 +647,7 @@ def _sub_datelike(self, other):
643647
other = Timestamp(other)
644648
i8 = self.asi8
645649
result = i8 - other.value
646-
if self.hasnans:
647-
mask = i8 == tslib.iNaT
648-
result[mask] = tslib.iNaT
650+
result = self._maybe_mask_results(result,fill_value=tslib.iNaT)
649651
return TimedeltaIndex(result,name=self.name,copy=False)
650652

651653
def _add_delta(self, delta):
@@ -1329,15 +1331,14 @@ def time(self):
13291331
"""
13301332
# can't call self.map() which tries to treat func as ufunc
13311333
# and causes recursion warnings on python 2.6
1332-
return _algos.arrmap_object(self.asobject.values, lambda x: x.time())
1334+
return self._maybe_mask_results(_algos.arrmap_object(self.asobject.values, lambda x: x.time()))
13331335

13341336
@property
13351337
def date(self):
13361338
"""
13371339
Returns numpy array of datetime.date. The date part of the Timestamps.
13381340
"""
1339-
return _algos.arrmap_object(self.asobject.values, lambda x: x.date())
1340-
1341+
return self._maybe_mask_results(_algos.arrmap_object(self.asobject.values, lambda x: x.date()))
13411342

13421343
def normalize(self):
13431344
"""

pandas/tseries/tdi.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,7 @@ def _evaluate_with_timedelta_like(self, other, op, opstr):
307307

308308
i8 = self.asi8
309309
result = i8/float(other.value)
310-
if self.hasnans:
311-
mask = i8 == tslib.iNaT
312-
result = result.astype('float64')
313-
result[mask] = np.nan
310+
result = self._maybe_mask_results(result,convert='float64')
314311
return Index(result,name=self.name,copy=False)
315312

316313
raise TypeError("can only perform ops with timedelta like values")
@@ -322,9 +319,7 @@ def _add_datelike(self, other):
322319
other = Timestamp(other)
323320
i8 = self.asi8
324321
result = i8 + other.value
325-
if self.hasnans:
326-
mask = i8 == tslib.iNaT
327-
result[mask] = tslib.iNaT
322+
result = self._maybe_mask_results(result,fill_value=tslib.iNaT)
328323
return DatetimeIndex(result,name=self.name,copy=False)
329324

330325
def _sub_datelike(self, other):
@@ -455,9 +450,7 @@ def astype(self, dtype):
455450
# return an index (essentially this is division)
456451
result = self.values.astype(dtype)
457452
if self.hasnans:
458-
result = result.astype('float64')
459-
result[self.asi8 == tslib.iNaT] = np.nan
460-
return Index(result,name=self.name)
453+
return Index(self._maybe_mask_results(result,convert='float64'),name=self.name)
461454

462455
return Index(result.astype('i8'),name=self.name)
463456

pandas/tseries/tests/test_period.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -500,11 +500,11 @@ def test_properties_nat(self):
500500
# confirm Period('NaT') work identical with Timestamp('NaT')
501501
for f in ['year', 'month', 'day', 'hour', 'minute', 'second',
502502
'week', 'dayofyear', 'quarter']:
503-
self.assertEqual(getattr(p_nat, f), -1)
504-
self.assertEqual(getattr(t_nat, f), -1)
503+
self.assertTrue(np.isnan(getattr(p_nat, f)))
504+
self.assertTrue(np.isnan(getattr(t_nat, f)))
505505

506506
for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']:
507-
self.assertEqual(getattr(p_nat, f), -1)
507+
self.assertTrue(np.isnan(getattr(p_nat, f)))
508508

509509
def test_pnow(self):
510510
dt = datetime.now()

pandas/tseries/tests/test_timedeltas.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# pylint: disable-msg=E1101,W0612
22

33
from __future__ import division
4-
from datetime import datetime, timedelta
4+
from datetime import datetime, timedelta, time
55
import nose
66

77
import numpy as np
@@ -460,6 +460,9 @@ def testit(unit, transform):
460460
self.assertRaises(ValueError, lambda : to_timedelta([1,2],unit='foo'))
461461
self.assertRaises(ValueError, lambda : to_timedelta(1,unit='foo'))
462462

463+
# time not supported ATM
464+
self.assertRaises(ValueError, lambda :to_timedelta(time(second=1)))
465+
463466
def test_to_timedelta_via_apply(self):
464467
# GH 5458
465468
expected = Series([np.timedelta64(1,'s')])

pandas/tseries/tests/test_timeseries.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from pandas import (Index, Series, TimeSeries, DataFrame,
1212
isnull, date_range, Timestamp, Period, DatetimeIndex,
13-
Int64Index, to_datetime, bdate_range, Float64Index)
13+
Int64Index, to_datetime, bdate_range, Float64Index, TimedeltaIndex)
1414

1515
import pandas.core.datetools as datetools
1616
import pandas.tseries.offsets as offsets
@@ -939,19 +939,19 @@ def test_nat_vector_field_access(self):
939939
'week', 'dayofyear']
940940
for field in fields:
941941
result = getattr(idx, field)
942-
expected = [getattr(x, field) if x is not NaT else -1
942+
expected = [getattr(x, field) if x is not NaT else np.nan
943943
for x in idx]
944-
self.assert_numpy_array_equal(result, expected)
944+
self.assert_numpy_array_equivalent(result, np.array(expected))
945945

946946
def test_nat_scalar_field_access(self):
947947
fields = ['year', 'quarter', 'month', 'day', 'hour',
948948
'minute', 'second', 'microsecond', 'nanosecond',
949949
'week', 'dayofyear']
950950
for field in fields:
951951
result = getattr(NaT, field)
952-
self.assertEqual(result, -1)
952+
self.assertTrue(np.isnan(result))
953953

954-
self.assertEqual(NaT.weekday(), -1)
954+
self.assertTrue(np.isnan(NaT.weekday()))
955955

956956
def test_to_datetime_types(self):
957957

@@ -3376,6 +3376,33 @@ def check(val,unit=None,h=1,s=1,us=0):
33763376
result = Timestamp(NaT)
33773377
self.assertIs(result, NaT)
33783378

3379+
def test_roundtrip(self):
3380+
3381+
# test value to string and back conversions
3382+
# further test accessors
3383+
base = Timestamp('20140101 00:00:00')
3384+
3385+
result = Timestamp(base.value + pd.Timedelta('5ms').value)
3386+
self.assertEqual(result,Timestamp(str(base) + ".005000"))
3387+
self.assertEqual(result.microsecond,5000)
3388+
3389+
result = Timestamp(base.value + pd.Timedelta('5us').value)
3390+
self.assertEqual(result,Timestamp(str(base) + ".000005"))
3391+
self.assertEqual(result.microsecond,5)
3392+
3393+
result = Timestamp(base.value + pd.Timedelta('5ns').value)
3394+
self.assertEqual(result,Timestamp(str(base) + ".000000005"))
3395+
self.assertEqual(result.nanosecond,5)
3396+
self.assertEqual(result.microsecond,0)
3397+
3398+
result = Timestamp(base.value + pd.Timedelta('6ms 5us').value)
3399+
self.assertEqual(result,Timestamp(str(base) + ".006005"))
3400+
self.assertEqual(result.microsecond,5+6*1000)
3401+
3402+
result = Timestamp(base.value + pd.Timedelta('200ms 5us').value)
3403+
self.assertEqual(result,Timestamp(str(base) + ".200005"))
3404+
self.assertEqual(result.microsecond,5+200*1000)
3405+
33793406
def test_comparison(self):
33803407
# 5-18-2012 00:00:00.000
33813408
stamp = long(1337299200000000000)

pandas/tslib.pyx

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,7 @@ class Timestamp(_Timestamp):
289289
result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second)
290290

291291
if self.nanosecond != 0:
292-
nanos = self.nanosecond + 1000 * self.microsecond
293-
result += '.%.9d' % nanos
292+
result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond)
294293
elif self.microsecond != 0:
295294
result += '.%.6d' % self.microsecond
296295

@@ -345,6 +344,10 @@ class Timestamp(_Timestamp):
345344

346345
weekofyear = week
347346

347+
@property
348+
def microsecond(self):
349+
return self._get_field('us')
350+
348351
@property
349352
def quarter(self):
350353
return self._get_field('q')
@@ -546,7 +549,7 @@ class NaTType(_NaT):
546549
return NPY_NAT
547550

548551
def weekday(self):
549-
return -1
552+
return np.nan
550553

551554
def toordinal(self):
552555
return -1
@@ -555,10 +558,10 @@ class NaTType(_NaT):
555558
return (__nat_unpickle, (None, ))
556559

557560
fields = ['year', 'quarter', 'month', 'day', 'hour',
558-
'minute', 'second', 'microsecond', 'nanosecond',
561+
'minute', 'second', 'millisecond', 'microsecond', 'nanosecond',
559562
'week', 'dayofyear']
560563
for field in fields:
561-
prop = property(fget=lambda self: -1)
564+
prop = property(fget=lambda self: np.nan)
562565
setattr(NaTType, field, prop)
563566

564567
def __nat_unpickle(*args):
@@ -3002,6 +3005,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
30023005
pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
30033006
out[i] = dts.us
30043007
return out
3008+
30053009
elif field == 'ns':
30063010
for i in range(count):
30073011
if dtindex[i] == NPY_NAT: out[i] = -1; continue
@@ -3855,7 +3859,7 @@ def get_period_field(int code, int64_t value, int freq):
38553859
if f is NULL:
38563860
raise ValueError('Unrecognized period code: %d' % code)
38573861
if value == iNaT:
3858-
return -1
3862+
return np.nan
38593863
return f(value, freq)
38603864

38613865
def get_period_field_arr(int code, ndarray[int64_t] arr, int freq):

0 commit comments

Comments
 (0)