From b12af6eef3ebcf4de4596ee8bcc2d642e1e19baf Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 31 Oct 2014 10:48:30 -0400 Subject: [PATCH] API/BUG: return np.nan rather than -1 for invalid datetime accessors values (GH8689) --- doc/source/whatsnew/v0.15.1.txt | 27 ++++++++++++++++++ pandas/tests/test_series.py | 23 +++++++++++++++ pandas/tseries/base.py | 25 +++++++++++++++++ pandas/tseries/index.py | 17 ++++++------ pandas/tseries/tdi.py | 13 ++------- pandas/tseries/tests/test_period.py | 6 ++-- pandas/tseries/tests/test_timedeltas.py | 5 +++- pandas/tseries/tests/test_timeseries.py | 37 +++++++++++++++++++++---- pandas/tslib.pyx | 16 +++++++---- 9 files changed, 136 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index ab7f3992cf37b..25909f385e7da 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -121,6 +121,33 @@ API changes pd.concat(deque((df1, df2))) +- ``s.dt.hour`` and other ``.dt`` accessors will now return ``np.nan`` for missing values (rather than previously -1), (:issue:`8689`) + + .. ipython:: python + + s = Series(date_range('20130101',periods=5,freq='D')) + s.iloc[2] = np.nan + s + + previous behavior: + + .. code-block:: python + + In [6]: s.dt.hour + Out[6]: + 0 0 + 1 0 + 2 -1 + 3 0 + 4 0 + dtype: int64 + + current behavior: + + .. ipython:: python + + s.dt.hour + .. _whatsnew_0151.enhancements: Enhancements diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 68590e1597bbc..018d8c614eaae 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -208,6 +208,29 @@ def f(): s.dt.hour[0] = 5 self.assertRaises(com.SettingWithCopyError, f) + def test_valid_dt_with_missing_values(self): + + from datetime import date, time + + # GH 8689 + s = Series(date_range('20130101',periods=5,freq='D')) + s_orig = s.copy() + s.iloc[2] = pd.NaT + + for attr in ['microsecond','nanosecond','second','minute','hour','day']: + expected = getattr(s.dt,attr).copy() + expected.iloc[2] = np.nan + result = getattr(s.dt,attr) + tm.assert_series_equal(result, expected) + + result = s.dt.date + expected = Series([date(2013,1,1),date(2013,1,2),np.nan,date(2013,1,4),date(2013,1,5)],dtype='object') + tm.assert_series_equal(result, expected) + + result = s.dt.time + expected = Series([time(0),time(0),np.nan,time(0),time(0)],dtype='object') + tm.assert_series_equal(result, expected) + def test_binop_maybe_preserve_name(self): # names match, preserve diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 3e51b55821fba..0a446919e95d2 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -174,6 +174,31 @@ def asobject(self): from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) + def _maybe_mask_results(self, result, fill_value=None, convert=None): + """ + Parameters + ---------- + result : a ndarray + convert : string/dtype or None + + Returns + ------- + result : ndarray with values replace by the fill_value + + mask the result if needed, convert to the provided dtype if its not None + + This is an internal routine + """ + + if self.hasnans: + mask = self.asi8 == tslib.iNaT + if convert: + result = result.astype(convert) + if fill_value is None: + fill_value = np.nan + result[mask] = fill_value + return result + def tolist(self): """ return a list of the underlying data diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 4ab48b2db98d0..52ab217cbffc6 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -46,13 +46,17 @@ def f(self): utc = _utc() if self.tz is not utc: values = self._local_timestamps() + if field in ['is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end']: month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12 - return tslib.get_start_end_field(values, field, self.freqstr, month_kw) + result = tslib.get_start_end_field(values, field, self.freqstr, month_kw) else: - return tslib.get_date_field(values, field) + result = tslib.get_date_field(values, field) + + return self._maybe_mask_results(result,convert='float64') + f.__name__ = name f.__doc__ = docstring return property(f) @@ -643,9 +647,7 @@ def _sub_datelike(self, other): other = Timestamp(other) i8 = self.asi8 result = i8 - other.value - if self.hasnans: - mask = i8 == tslib.iNaT - result[mask] = tslib.iNaT + result = self._maybe_mask_results(result,fill_value=tslib.iNaT) return TimedeltaIndex(result,name=self.name,copy=False) def _add_delta(self, delta): @@ -1329,15 +1331,14 @@ def time(self): """ # can't call self.map() which tries to treat func as ufunc # and causes recursion warnings on python 2.6 - return _algos.arrmap_object(self.asobject.values, lambda x: x.time()) + return self._maybe_mask_results(_algos.arrmap_object(self.asobject.values, lambda x: x.time())) @property def date(self): """ Returns numpy array of datetime.date. The date part of the Timestamps. """ - return _algos.arrmap_object(self.asobject.values, lambda x: x.date()) - + return self._maybe_mask_results(_algos.arrmap_object(self.asobject.values, lambda x: x.date())) def normalize(self): """ diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 2c452b2fa7ded..5a041ed09fb27 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -307,10 +307,7 @@ def _evaluate_with_timedelta_like(self, other, op, opstr): i8 = self.asi8 result = i8/float(other.value) - if self.hasnans: - mask = i8 == tslib.iNaT - result = result.astype('float64') - result[mask] = np.nan + result = self._maybe_mask_results(result,convert='float64') return Index(result,name=self.name,copy=False) raise TypeError("can only perform ops with timedelta like values") @@ -322,9 +319,7 @@ def _add_datelike(self, other): other = Timestamp(other) i8 = self.asi8 result = i8 + other.value - if self.hasnans: - mask = i8 == tslib.iNaT - result[mask] = tslib.iNaT + result = self._maybe_mask_results(result,fill_value=tslib.iNaT) return DatetimeIndex(result,name=self.name,copy=False) def _sub_datelike(self, other): @@ -455,9 +450,7 @@ def astype(self, dtype): # return an index (essentially this is division) result = self.values.astype(dtype) if self.hasnans: - result = result.astype('float64') - result[self.asi8 == tslib.iNaT] = np.nan - return Index(result,name=self.name) + return Index(self._maybe_mask_results(result,convert='float64'),name=self.name) return Index(result.astype('i8'),name=self.name) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index e6e6b48ccb573..e046d687435e7 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -500,11 +500,11 @@ def test_properties_nat(self): # confirm Period('NaT') work identical with Timestamp('NaT') for f in ['year', 'month', 'day', 'hour', 'minute', 'second', 'week', 'dayofyear', 'quarter']: - self.assertEqual(getattr(p_nat, f), -1) - self.assertEqual(getattr(t_nat, f), -1) + self.assertTrue(np.isnan(getattr(p_nat, f))) + self.assertTrue(np.isnan(getattr(t_nat, f))) for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']: - self.assertEqual(getattr(p_nat, f), -1) + self.assertTrue(np.isnan(getattr(p_nat, f))) def test_pnow(self): dt = datetime.now() diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 282301499dcbc..11bf22a055b8f 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1,7 +1,7 @@ # pylint: disable-msg=E1101,W0612 from __future__ import division -from datetime import datetime, timedelta +from datetime import datetime, timedelta, time import nose import numpy as np @@ -460,6 +460,9 @@ def testit(unit, transform): self.assertRaises(ValueError, lambda : to_timedelta([1,2],unit='foo')) self.assertRaises(ValueError, lambda : to_timedelta(1,unit='foo')) + # time not supported ATM + self.assertRaises(ValueError, lambda :to_timedelta(time(second=1))) + def test_to_timedelta_via_apply(self): # GH 5458 expected = Series([np.timedelta64(1,'s')]) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 1980924483bfb..bf1f0d31e8d3e 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -10,7 +10,7 @@ from pandas import (Index, Series, TimeSeries, DataFrame, isnull, date_range, Timestamp, Period, DatetimeIndex, - Int64Index, to_datetime, bdate_range, Float64Index) + Int64Index, to_datetime, bdate_range, Float64Index, TimedeltaIndex) import pandas.core.datetools as datetools import pandas.tseries.offsets as offsets @@ -939,9 +939,9 @@ def test_nat_vector_field_access(self): 'week', 'dayofyear'] for field in fields: result = getattr(idx, field) - expected = [getattr(x, field) if x is not NaT else -1 + expected = [getattr(x, field) if x is not NaT else np.nan for x in idx] - self.assert_numpy_array_equal(result, expected) + self.assert_numpy_array_equivalent(result, np.array(expected)) def test_nat_scalar_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', @@ -949,9 +949,9 @@ def test_nat_scalar_field_access(self): 'week', 'dayofyear'] for field in fields: result = getattr(NaT, field) - self.assertEqual(result, -1) + self.assertTrue(np.isnan(result)) - self.assertEqual(NaT.weekday(), -1) + self.assertTrue(np.isnan(NaT.weekday())) def test_to_datetime_types(self): @@ -3376,6 +3376,33 @@ def check(val,unit=None,h=1,s=1,us=0): result = Timestamp(NaT) self.assertIs(result, NaT) + def test_roundtrip(self): + + # test value to string and back conversions + # further test accessors + base = Timestamp('20140101 00:00:00') + + result = Timestamp(base.value + pd.Timedelta('5ms').value) + self.assertEqual(result,Timestamp(str(base) + ".005000")) + self.assertEqual(result.microsecond,5000) + + result = Timestamp(base.value + pd.Timedelta('5us').value) + self.assertEqual(result,Timestamp(str(base) + ".000005")) + self.assertEqual(result.microsecond,5) + + result = Timestamp(base.value + pd.Timedelta('5ns').value) + self.assertEqual(result,Timestamp(str(base) + ".000000005")) + self.assertEqual(result.nanosecond,5) + self.assertEqual(result.microsecond,0) + + result = Timestamp(base.value + pd.Timedelta('6ms 5us').value) + self.assertEqual(result,Timestamp(str(base) + ".006005")) + self.assertEqual(result.microsecond,5+6*1000) + + result = Timestamp(base.value + pd.Timedelta('200ms 5us').value) + self.assertEqual(result,Timestamp(str(base) + ".200005")) + self.assertEqual(result.microsecond,5+200*1000) + def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = long(1337299200000000000) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index e88d88c86cf48..ffe94a94b15b5 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -289,8 +289,7 @@ class Timestamp(_Timestamp): result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second) if self.nanosecond != 0: - nanos = self.nanosecond + 1000 * self.microsecond - result += '.%.9d' % nanos + result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond) elif self.microsecond != 0: result += '.%.6d' % self.microsecond @@ -345,6 +344,10 @@ class Timestamp(_Timestamp): weekofyear = week + @property + def microsecond(self): + return self._get_field('us') + @property def quarter(self): return self._get_field('q') @@ -546,7 +549,7 @@ class NaTType(_NaT): return NPY_NAT def weekday(self): - return -1 + return np.nan def toordinal(self): return -1 @@ -555,10 +558,10 @@ class NaTType(_NaT): return (__nat_unpickle, (None, )) fields = ['year', 'quarter', 'month', 'day', 'hour', - 'minute', 'second', 'microsecond', 'nanosecond', + 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', 'week', 'dayofyear'] for field in fields: - prop = property(fget=lambda self: -1) + prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) def __nat_unpickle(*args): @@ -3002,6 +3005,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.us return out + elif field == 'ns': for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue @@ -3855,7 +3859,7 @@ def get_period_field(int code, int64_t value, int freq): if f is NULL: raise ValueError('Unrecognized period code: %d' % code) if value == iNaT: - return -1 + return np.nan return f(value, freq) def get_period_field_arr(int code, ndarray[int64_t] arr, int freq):