Skip to content

Commit 818cf27

Browse files
committed
Merge pull request #7435 from jreback/inference
API: Improved inference of datetime/timedelta with mixed null objects. (GH7431)
2 parents 9f7d41b + cd613bf commit 818cf27

File tree

11 files changed

+275
-90
lines changed

11 files changed

+275
-90
lines changed

doc/source/v0.14.1.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,8 @@ API changes
4646
day = offsets.Day(normalize=True)
4747
day.apply(Timestamp('2014-01-01 09:00'))
4848

49-
50-
51-
52-
49+
- Improved inference of datetime/timedelta with mixed null objects. Fixes a regression from 0.13.1 in the interpretation of an object Index
50+
with all null elements (:issue:`7431`)
5351

5452
- Openpyxl now raises a ValueError on construction of the openpyxl writer
5553
instead of warning on pandas import (:issue:`7284`).

pandas/core/common.py

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,24 +1782,81 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
17821782
value.dtype == np.object_)):
17831783
pass
17841784

1785+
# try to infer if we have a datetimelike here
1786+
# otherwise pass thru
17851787
else:
1786-
# we might have a array (or single object) that is datetime like,
1787-
# and no dtype is passed don't change the value unless we find a
1788-
# datetime set
1789-
v = value
1790-
if not is_list_like(v):
1791-
v = [v]
1792-
if len(v):
1793-
inferred_type = lib.infer_dtype(v)
1794-
if inferred_type in ['datetime', 'datetime64']:
1795-
try:
1796-
value = tslib.array_to_datetime(np.array(v))
1797-
except:
1798-
pass
1799-
elif inferred_type in ['timedelta', 'timedelta64']:
1800-
from pandas.tseries.timedeltas import \
1801-
_possibly_cast_to_timedelta
1802-
value = _possibly_cast_to_timedelta(value, coerce='compat')
1788+
value = _possibly_infer_to_datetimelike(value)
1789+
1790+
return value
1791+
1792+
def _possibly_infer_to_datetimelike(value):
    """
    Try to turn an object array (or a single object) that holds datetimes
    or timedeltas into a datetime64 / timedelta64 array.

    No dtype is passed here, so the value is only changed when a
    datetime/timedelta set is actually found.  The inference is pretty
    strict: a datetime or timedelta is REQUIRED, in addition to possible
    nulls / string-likes.  Input that holds ONLY strings is NOT treated as
    datetimelike.

    Parameters
    ----------
    value : scalar, list-like or ndarray

    Returns
    -------
    ``value`` itself when it is empty or its inferred type is not a
    candidate for conversion.  Otherwise an ndarray with the shape of the
    input: the converted data when a conversion succeeded, or the original
    objects when every attempted conversion failed.
    """
    flat = value
    if not is_list_like(flat):
        flat = [flat]
    if not isinstance(flat, np.ndarray):
        flat = np.array(flat)
    shape = flat.shape
    if flat.ndim != 1:
        flat = flat.ravel()

    if not len(flat):
        return value

    def _try_datetime(arr):
        # safe coerce to datetime64; the flat input comes back on failure
        try:
            return tslib.array_to_datetime(arr, raise_=True).reshape(shape)
        except Exception:
            return arr

    def _try_timedelta(arr):
        # safe coerce to timedelta64; the flat input comes back on failure

        # will try first with a string & object conversion
        from pandas.tseries.timedeltas import to_timedelta
        try:
            return to_timedelta(arr).values.reshape(shape)
        except Exception:
            pass

        # this is for compat with numpy < 1.7
        # but string-likes will fail here
        from pandas.tseries.timedeltas import _possibly_cast_to_timedelta
        try:
            return _possibly_cast_to_timedelta(
                arr, coerce='compat').reshape(shape)
        except Exception:
            return arr

    # do a quick inference for perf: only the leading elements are inspected
    inferred_type = lib.infer_dtype(flat[:3])

    result = None
    if inferred_type in ('datetime', 'datetime64'):
        result = _try_datetime(flat)
    elif inferred_type in ('timedelta', 'timedelta64'):
        result = _try_timedelta(flat)
    elif inferred_type == 'mixed':
        # it's possible to have nulls intermixed within the datetime or
        # timedelta; these will in general have an inferred_type of 'mixed',
        # so we have to try both datetime and timedelta

        # try timedelta first to avoid spurious datetime conversions
        # e.g. '00:00:01' is a timedelta but technically is also a datetime
        if lib.is_possible_datetimelike_array(_ensure_object(flat)):
            result = _try_timedelta(flat)
            if lib.infer_dtype(result) == 'mixed':
                result = _try_datetime(flat)

    if result is not None:
        # a failed conversion hands back the flattened input; put the
        # caller's shape back so N-d data is never returned as 1-d
        if result.shape != shape:
            result = result.reshape(shape)
        value = result

    return value
18051862

pandas/core/internals.py

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
_NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
1313
ABCSparseSeries, _infer_dtype_from_scalar,
1414
_is_null_datelike_scalar,
15-
is_timedelta64_dtype, is_datetime64_dtype,)
15+
is_timedelta64_dtype, is_datetime64_dtype,
16+
_possibly_infer_to_datetimelike)
1617
from pandas.core.index import Index, MultiIndex, _ensure_index
1718
from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
1819
import pandas.core.common as com
@@ -1807,26 +1808,21 @@ def make_block(values, placement, klass=None, ndim=None,
18071808
elif issubclass(vtype, np.complexfloating):
18081809
klass = ComplexBlock
18091810

1810-
# try to infer a DatetimeBlock, or set to an ObjectBlock
18111811
else:
18121812

1813+
# we want to infer here whether it's a datetimelike when it's object dtype
1814+
# this is pretty strict in that it requires a datetime/timedelta
1815+
# value IN addition to possible nulls/strings
1816+
# an array of ONLY strings will not be inferred
18131817
if np.prod(values.shape):
1814-
flat = values.ravel()
1815-
1816-
# try with just the first element; we just need to see if
1817-
# this is a datetime or not
1818-
inferred_type = lib.infer_dtype(flat[0:1])
1819-
if inferred_type in ['datetime', 'datetime64']:
1820-
1821-
# we have an object array that has been inferred as
1822-
# datetime, so convert it
1823-
try:
1824-
values = tslib.array_to_datetime(
1825-
flat).reshape(values.shape)
1826-
if issubclass(values.dtype.type, np.datetime64):
1827-
klass = DatetimeBlock
1828-
except: # it already object, so leave it
1829-
pass
1818+
result = _possibly_infer_to_datetimelike(values)
1819+
vtype = result.dtype.type
1820+
if issubclass(vtype, np.datetime64):
1821+
klass = DatetimeBlock
1822+
values = result
1823+
elif (issubclass(vtype, np.timedelta64)):
1824+
klass = TimeDeltaBlock
1825+
values = result
18301826

18311827
if klass is None:
18321828
klass = ObjectBlock
@@ -2525,7 +2521,7 @@ def _consolidate_inplace(self):
25252521
self._known_consolidated = True
25262522
self._rebuild_blknos_and_blklocs()
25272523

2528-
def get(self, item):
2524+
def get(self, item, fastpath=True):
25292525
"""
25302526
Return values for selected item (ndarray or BlockManager).
25312527
"""
@@ -2543,7 +2539,7 @@ def get(self, item):
25432539
else:
25442540
raise ValueError("cannot label index with a null key")
25452541

2546-
return self.iget(loc)
2542+
return self.iget(loc, fastpath=fastpath)
25472543
else:
25482544

25492545
if isnull(item):
@@ -2553,8 +2549,25 @@ def get(self, item):
25532549
return self.reindex_indexer(new_axis=self.items[indexer],
25542550
indexer=indexer, axis=0, allow_dups=True)
25552551

2556-
def iget(self, i):
2557-
return self.blocks[self._blknos[i]].iget(self._blklocs[i])
2552+
def iget(self, i, fastpath=True):
    """
    Return the data stored at position ``i``.

    When ``fastpath`` is true and the selected values are one-dimensional
    and come from a non-sparse block, they are wrapped in a
    SingleBlockManager.  In every other case the values are returned
    exactly as the owning block hands them out.
    """
    owner = self.blocks[self._blknos[i]]
    data = owner.iget(self._blklocs[i])

    can_wrap = fastpath and not owner.is_sparse and data.ndim == 1
    if not can_wrap:
        return data

    # fastpath shortcut for selecting a single dim from a 2-dim BM
    single = owner.make_block_same_class(data,
                                         placement=slice(0, len(data)),
                                         fastpath=True)
    return SingleBlockManager([single], self.axes[1])
2570+
25582571

25592572
def get_scalar(self, tup):
25602573
"""

pandas/io/tests/test_json/test_pandas.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import os
55

66
import numpy as np
7-
8-
from pandas import Series, DataFrame, DatetimeIndex, Timestamp
7+
import nose
8+
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, _np_version_under1p7
99
import pandas as pd
1010
read_json = pd.read_json
1111

@@ -600,11 +600,29 @@ def test_url(self):
600600
for c in ['created_at', 'closed_at', 'updated_at']:
601601
self.assertEqual(result[c].dtype, 'datetime64[ns]')
602602

603-
def test_default_handler(self):
603+
def test_timedelta(self):
    # timedelta data should survive a JSON round trip once the values
    # read back are converted again with unit='ms'
    if _np_version_under1p7:
        raise nose.SkipTest("numpy < 1.7")

    from datetime import timedelta

    def converter(x):
        return pd.to_timedelta(x, unit='ms')

    # Series round trip
    s = Series([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    series_back = pd.read_json(s.to_json(), typ='series').apply(converter)
    assert_series_equal(s, series_back)

    # DataFrame round trip
    frame = DataFrame([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
    frame_back = pd.read_json(frame.to_json()).apply(converter)
    assert_frame_equal(frame, frame_back)
618+
619+
def test_default_handler(self):
620+
from datetime import timedelta
621+
622+
frame = DataFrame([timedelta(23), timedelta(seconds=5), 42])
606623
self.assertRaises(OverflowError, frame.to_json)
607-
expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5))])
624+
625+
expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5)), 42])
608626
assert_frame_equal(
609627
expected, pd.read_json(frame.to_json(default_handler=str)))
610628

pandas/src/inference.pyx

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,27 @@ def infer_dtype_list(list values):
172172
pass
173173

174174

175+
def is_possible_datetimelike_array(object arr):
    # Decide whether ``arr`` could be a datetimelike array: every element
    # must be a string, a null, a datetime or a timedelta, and at least one
    # datetime or timedelta has to be present.
    cdef:
        Py_ssize_t idx, length = len(arr)
        bint found_datetime = 0, found_timedelta = 0
        object item

    for idx in range(length):
        item = arr[idx]

        # strings and nulls neither confirm nor rule out a datetimelike
        if util.is_string_object(item) or util._checknull(item):
            continue

        if is_datetime(item):
            found_datetime = 1
        elif is_timedelta(item):
            found_timedelta = 1
        else:
            # anything else disqualifies the whole array
            return False

    return found_datetime or found_timedelta
195+
175196
cdef inline bint is_null_datetimelike(v):
176197
# determine if we have a null for a timedelta/datetime (or integer versions)
177198
if util._checknull(v):
@@ -331,61 +352,84 @@ def is_unicode_array(ndarray values):
331352

332353

333354
def is_datetime_array(ndarray[object] values):
334-
cdef int i, n = len(values)
355+
cdef int i, null_count = 0, n = len(values)
335356
cdef object v
336357
if n == 0:
337358
return False
359+
360+
# return False for all nulls
338361
for i in range(n):
339362
v = values[i]
340-
if not (is_datetime(v) or is_null_datetimelike(v)):
363+
if is_null_datetimelike(v):
364+
# we are a regular null
365+
if util._checknull(v):
366+
null_count += 1
367+
elif not is_datetime(v):
341368
return False
342-
return True
343-
369+
return null_count != n
344370

345371
def is_datetime64_array(ndarray values):
346-
cdef int i, n = len(values)
372+
cdef int i, null_count = 0, n = len(values)
347373
cdef object v
348374
if n == 0:
349375
return False
376+
377+
# return False for all nulls
350378
for i in range(n):
351379
v = values[i]
352-
if not (util.is_datetime64_object(v) or is_null_datetimelike(v)):
380+
if is_null_datetimelike(v):
381+
# we are a regular null
382+
if util._checknull(v):
383+
null_count += 1
384+
elif not util.is_datetime64_object(v):
353385
return False
354-
return True
386+
return null_count != n
355387

356388
def is_timedelta_array(ndarray values):
357-
cdef int i, n = len(values)
389+
cdef int i, null_count = 0, n = len(values)
358390
cdef object v
359391
if n == 0:
360392
return False
361393
for i in range(n):
362394
v = values[i]
363-
if not (PyDelta_Check(v) or is_null_datetimelike(v)):
395+
if is_null_datetimelike(v):
396+
# we are a regular null
397+
if util._checknull(v):
398+
null_count += 1
399+
elif not PyDelta_Check(v):
364400
return False
365-
return True
401+
return null_count != n
366402

367403
def is_timedelta64_array(ndarray values):
368-
cdef int i, n = len(values)
404+
cdef int i, null_count = 0, n = len(values)
369405
cdef object v
370406
if n == 0:
371407
return False
372408
for i in range(n):
373409
v = values[i]
374-
if not (util.is_timedelta64_object(v) or is_null_datetimelike(v)):
410+
if is_null_datetimelike(v):
411+
# we are a regular null
412+
if util._checknull(v):
413+
null_count += 1
414+
elif not util.is_timedelta64_object(v):
375415
return False
376-
return True
416+
return null_count != n
377417

378418
def is_timedelta_or_timedelta64_array(ndarray values):
379419
""" infer with timedeltas and/or nat/none """
380-
cdef int i, n = len(values)
420+
cdef int i, null_count = 0, n = len(values)
381421
cdef object v
382422
if n == 0:
383423
return False
384424
for i in range(n):
385425
v = values[i]
386-
if not (is_timedelta(v) or is_null_datetimelike(v)):
426+
if is_null_datetimelike(v):
427+
# we are a regular null
428+
if util._checknull(v):
429+
null_count += 1
430+
elif not is_timedelta(v):
387431
return False
388-
return True
432+
return null_count != n
389433

390434
def is_date_array(ndarray[object] values):
391435
cdef int i, n = len(values)

0 commit comments

Comments
 (0)