Skip to content

API/BUG: return np.nan rather than -1 for invalid datetime accessors values (GH8689) #8695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 31, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions doc/source/whatsnew/v0.15.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,33 @@ API changes

pd.concat(deque((df1, df2)))

- ``s.dt.hour`` and the other ``.dt`` accessors now return ``np.nan`` for missing values (previously they returned ``-1``) (:issue:`8689`)

.. ipython:: python

s = Series(date_range('20130101',periods=5,freq='D'))
s.iloc[2] = np.nan
s

previous behavior:

.. code-block:: python

In [6]: s.dt.hour
Out[6]:
0 0
1 0
2 -1
3 0
4 0
dtype: int64

current behavior:

.. ipython:: python

s.dt.hour

.. _whatsnew_0151.enhancements:

Enhancements
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,29 @@ def f():
s.dt.hour[0] = 5
self.assertRaises(com.SettingWithCopyError, f)

def test_valid_dt_with_missing_values(self):
    """
    GH 8689: ``.dt`` accessors on a Series containing a missing value
    must report NaN at the missing position.
    """

    from datetime import date, time

    # keep a NaT-free copy so the expected values do not come from the
    # accessor call on the very series that is being checked
    s_orig = Series(date_range('20130101',periods=5,freq='D'))
    s = s_orig.copy()
    s.iloc[2] = pd.NaT

    for attr in ['microsecond','nanosecond','second','minute','hour','day']:
        # the accessor result is copied before position 2 is overwritten
        expected = getattr(s_orig.dt,attr).copy()
        expected.iloc[2] = np.nan
        result = getattr(s.dt,attr)
        tm.assert_series_equal(result, expected)

    # object-valued accessors: the missing slot is expected to hold NaN
    result = s.dt.date
    expected = Series([date(2013,1,1),date(2013,1,2),np.nan,date(2013,1,4),date(2013,1,5)],dtype='object')
    tm.assert_series_equal(result, expected)

    result = s.dt.time
    expected = Series([time(0),time(0),np.nan,time(0),time(0)],dtype='object')
    tm.assert_series_equal(result, expected)

def test_binop_maybe_preserve_name(self):

# names match, preserve
Expand Down
25 changes: 25 additions & 0 deletions pandas/tseries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,31 @@ def asobject(self):
from pandas.core.index import Index
return Index(self._box_values(self.asi8), name=self.name, dtype=object)

def _maybe_mask_results(self, result, fill_value=None, convert=None):
"""
Parameters
----------
result : a ndarray
convert : string/dtype or None

Returns
-------
result : ndarray with values replace by the fill_value

mask the result if needed, convert to the provided dtype if its not None

This is an internal routine
"""

if self.hasnans:
mask = self.asi8 == tslib.iNaT
if convert:
result = result.astype(convert)
if fill_value is None:
fill_value = np.nan
result[mask] = fill_value
return result

def tolist(self):
"""
return a list of the underlying data
Expand Down
17 changes: 9 additions & 8 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,17 @@ def f(self):
utc = _utc()
if self.tz is not utc:
values = self._local_timestamps()

if field in ['is_month_start', 'is_month_end',
'is_quarter_start', 'is_quarter_end',
'is_year_start', 'is_year_end']:
month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12
return tslib.get_start_end_field(values, field, self.freqstr, month_kw)
result = tslib.get_start_end_field(values, field, self.freqstr, month_kw)
else:
return tslib.get_date_field(values, field)
result = tslib.get_date_field(values, field)

return self._maybe_mask_results(result,convert='float64')

f.__name__ = name
f.__doc__ = docstring
return property(f)
Expand Down Expand Up @@ -643,9 +647,7 @@ def _sub_datelike(self, other):
other = Timestamp(other)
i8 = self.asi8
result = i8 - other.value
if self.hasnans:
mask = i8 == tslib.iNaT
result[mask] = tslib.iNaT
result = self._maybe_mask_results(result,fill_value=tslib.iNaT)
return TimedeltaIndex(result,name=self.name,copy=False)

def _add_delta(self, delta):
Expand Down Expand Up @@ -1329,15 +1331,14 @@ def time(self):
"""
# can't call self.map() which tries to treat func as ufunc
# and causes recursion warnings on python 2.6
return _algos.arrmap_object(self.asobject.values, lambda x: x.time())
return self._maybe_mask_results(_algos.arrmap_object(self.asobject.values, lambda x: x.time()))

@property
def date(self):
"""
Returns numpy array of datetime.date. The date part of the Timestamps.
"""
return _algos.arrmap_object(self.asobject.values, lambda x: x.date())

return self._maybe_mask_results(_algos.arrmap_object(self.asobject.values, lambda x: x.date()))

def normalize(self):
"""
Expand Down
13 changes: 3 additions & 10 deletions pandas/tseries/tdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,7 @@ def _evaluate_with_timedelta_like(self, other, op, opstr):

i8 = self.asi8
result = i8/float(other.value)
if self.hasnans:
mask = i8 == tslib.iNaT
result = result.astype('float64')
result[mask] = np.nan
result = self._maybe_mask_results(result,convert='float64')
return Index(result,name=self.name,copy=False)

raise TypeError("can only perform ops with timedelta like values")
Expand All @@ -322,9 +319,7 @@ def _add_datelike(self, other):
other = Timestamp(other)
i8 = self.asi8
result = i8 + other.value
if self.hasnans:
mask = i8 == tslib.iNaT
result[mask] = tslib.iNaT
result = self._maybe_mask_results(result,fill_value=tslib.iNaT)
return DatetimeIndex(result,name=self.name,copy=False)

def _sub_datelike(self, other):
Expand Down Expand Up @@ -455,9 +450,7 @@ def astype(self, dtype):
# return an index (essentially this is division)
result = self.values.astype(dtype)
if self.hasnans:
result = result.astype('float64')
result[self.asi8 == tslib.iNaT] = np.nan
return Index(result,name=self.name)
return Index(self._maybe_mask_results(result,convert='float64'),name=self.name)

return Index(result.astype('i8'),name=self.name)

Expand Down
6 changes: 3 additions & 3 deletions pandas/tseries/tests/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,11 +500,11 @@ def test_properties_nat(self):
# confirm Period('NaT') work identical with Timestamp('NaT')
for f in ['year', 'month', 'day', 'hour', 'minute', 'second',
'week', 'dayofyear', 'quarter']:
self.assertEqual(getattr(p_nat, f), -1)
self.assertEqual(getattr(t_nat, f), -1)
self.assertTrue(np.isnan(getattr(p_nat, f)))
self.assertTrue(np.isnan(getattr(t_nat, f)))

for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']:
self.assertEqual(getattr(p_nat, f), -1)
self.assertTrue(np.isnan(getattr(p_nat, f)))

def test_pnow(self):
dt = datetime.now()
Expand Down
5 changes: 4 additions & 1 deletion pandas/tseries/tests/test_timedeltas.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# pylint: disable-msg=E1101,W0612

from __future__ import division
from datetime import datetime, timedelta
from datetime import datetime, timedelta, time
import nose

import numpy as np
Expand Down Expand Up @@ -460,6 +460,9 @@ def testit(unit, transform):
self.assertRaises(ValueError, lambda : to_timedelta([1,2],unit='foo'))
self.assertRaises(ValueError, lambda : to_timedelta(1,unit='foo'))

# time not supported ATM
self.assertRaises(ValueError, lambda :to_timedelta(time(second=1)))

def test_to_timedelta_via_apply(self):
# GH 5458
expected = Series([np.timedelta64(1,'s')])
Expand Down
37 changes: 32 additions & 5 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from pandas import (Index, Series, TimeSeries, DataFrame,
isnull, date_range, Timestamp, Period, DatetimeIndex,
Int64Index, to_datetime, bdate_range, Float64Index)
Int64Index, to_datetime, bdate_range, Float64Index, TimedeltaIndex)

import pandas.core.datetools as datetools
import pandas.tseries.offsets as offsets
Expand Down Expand Up @@ -939,19 +939,19 @@ def test_nat_vector_field_access(self):
'week', 'dayofyear']
for field in fields:
result = getattr(idx, field)
expected = [getattr(x, field) if x is not NaT else -1
expected = [getattr(x, field) if x is not NaT else np.nan
for x in idx]
self.assert_numpy_array_equal(result, expected)
self.assert_numpy_array_equivalent(result, np.array(expected))

def test_nat_scalar_field_access(self):
fields = ['year', 'quarter', 'month', 'day', 'hour',
'minute', 'second', 'microsecond', 'nanosecond',
'week', 'dayofyear']
for field in fields:
result = getattr(NaT, field)
self.assertEqual(result, -1)
self.assertTrue(np.isnan(result))

self.assertEqual(NaT.weekday(), -1)
self.assertTrue(np.isnan(NaT.weekday()))

def test_to_datetime_types(self):

Expand Down Expand Up @@ -3376,6 +3376,33 @@ def check(val,unit=None,h=1,s=1,us=0):
result = Timestamp(NaT)
self.assertIs(result, NaT)

def test_roundtrip(self):
    """
    Value -> string -> value conversions: a Timestamp built from an
    offset integer value must equal the one parsed from the matching
    string, and its sub-second accessors must report the offset.
    """
    base = Timestamp('20140101 00:00:00')

    # (offset, fractional-second suffix, ((accessor, expected), ...))
    cases = [
        ('5ms', ".005000", (('microsecond', 5000),)),
        ('5us', ".000005", (('microsecond', 5),)),
        ('5ns', ".000000005", (('nanosecond', 5), ('microsecond', 0))),
        ('6ms 5us', ".006005", (('microsecond', 5+6*1000),)),
        ('200ms 5us', ".200005", (('microsecond', 5+200*1000),)),
    ]

    for offset, suffix, accessors in cases:
        result = Timestamp(base.value + pd.Timedelta(offset).value)
        self.assertEqual(result,Timestamp(str(base) + suffix))
        for accessor, wanted in accessors:
            self.assertEqual(getattr(result, accessor),wanted)

def test_comparison(self):
# 5-18-2012 00:00:00.000
stamp = long(1337299200000000000)
Expand Down
16 changes: 10 additions & 6 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,7 @@ class Timestamp(_Timestamp):
result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second)

if self.nanosecond != 0:
nanos = self.nanosecond + 1000 * self.microsecond
result += '.%.9d' % nanos
result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond)
elif self.microsecond != 0:
result += '.%.6d' % self.microsecond

Expand Down Expand Up @@ -345,6 +344,10 @@ class Timestamp(_Timestamp):

weekofyear = week

@property
def microsecond(self):
    """Value of the ``'us'`` field as returned by ``self._get_field``."""
    us_field = self._get_field('us')
    return us_field

@property
def quarter(self):
return self._get_field('q')
Expand Down Expand Up @@ -546,7 +549,7 @@ class NaTType(_NaT):
return NPY_NAT

def weekday(self):
return -1
return np.nan

def toordinal(self):
return -1
Expand All @@ -555,10 +558,10 @@ class NaTType(_NaT):
return (__nat_unpickle, (None, ))

fields = ['year', 'quarter', 'month', 'day', 'hour',
'minute', 'second', 'microsecond', 'nanosecond',
'minute', 'second', 'millisecond', 'microsecond', 'nanosecond',
'week', 'dayofyear']
for field in fields:
prop = property(fget=lambda self: -1)
prop = property(fget=lambda self: np.nan)
setattr(NaTType, field, prop)

def __nat_unpickle(*args):
Expand Down Expand Up @@ -3002,6 +3005,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
out[i] = dts.us
return out

elif field == 'ns':
for i in range(count):
if dtindex[i] == NPY_NAT: out[i] = -1; continue
Expand Down Expand Up @@ -3855,7 +3859,7 @@ def get_period_field(int code, int64_t value, int freq):
if f is NULL:
raise ValueError('Unrecognized period code: %d' % code)
if value == iNaT:
return -1
return np.nan
return f(value, freq)

def get_period_field_arr(int code, ndarray[int64_t] arr, int freq):
Expand Down