Skip to content

API: Improved inference of datetime/timedelta with mixed null objects. (GH7431) #7435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@ API changes
day = offsets.Day(normalize=True)
day.apply(Timestamp('2014-01-01 09:00'))





- Improved inference of datetime/timedelta with mixed null objects. Regression from 0.13.1 in interpretation of an object Index
with all null elements (:issue:`7431`)

- Openpyxl now raises a ValueError on construction of the openpyxl writer
instead of warning on pandas import (:issue:`7284`).
Expand Down
91 changes: 74 additions & 17 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1782,24 +1782,81 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
value.dtype == np.object_)):
pass

# try to infer if we have a datetimelike here
# otherwise pass thru
else:
# we might have a array (or single object) that is datetime like,
# and no dtype is passed don't change the value unless we find a
# datetime set
v = value
if not is_list_like(v):
v = [v]
if len(v):
inferred_type = lib.infer_dtype(v)
if inferred_type in ['datetime', 'datetime64']:
try:
value = tslib.array_to_datetime(np.array(v))
except:
pass
elif inferred_type in ['timedelta', 'timedelta64']:
from pandas.tseries.timedeltas import \
_possibly_cast_to_timedelta
value = _possibly_cast_to_timedelta(value, coerce='compat')
value = _possibly_infer_to_datetimelike(value)

return value

def _possibly_infer_to_datetimelike(value):
    """
    Try to coerce an object array (or a single object) to a
    datetime64/timedelta64 array, returning the input unchanged when no
    datetimelike set is found.

    The inference is strict: an actual datetime/timedelta value is
    required (possibly mixed with nulls/string-likes); an array of ONLY
    strings is never coerced.
    """
    arr = value
    if not is_list_like(arr):
        arr = [arr]
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr)

    # remember the original shape; coercion works on the raveled data
    # and reshapes back on success
    shape = arr.shape
    if arr.ndim != 1:
        arr = arr.ravel()

    if not len(arr):
        return value

    def _coerce_datetime(x):
        # safe coerce to datetime64; on failure hand back the input
        try:
            return tslib.array_to_datetime(x, raise_=True).reshape(shape)
        except:
            return x

    def _coerce_timedelta(x):
        # safe coerce to timedelta64; try the string/object conversion
        # first
        from pandas.tseries.timedeltas import to_timedelta
        try:
            return to_timedelta(x).values.reshape(shape)
        except:
            # compat path for numpy < 1.7; string-likes will fail here
            from pandas.tseries.timedeltas import \
                _possibly_cast_to_timedelta
            try:
                return _possibly_cast_to_timedelta(x, coerce='compat').reshape(shape)
            except:
                return x

    # infer on a short prefix only, for performance
    inferred_type = lib.infer_dtype(arr[:min(3, len(arr))])

    if inferred_type in ('datetime', 'datetime64'):
        value = _coerce_datetime(arr)
    elif inferred_type in ('timedelta', 'timedelta64'):
        value = _coerce_timedelta(arr)
    elif inferred_type in ('mixed',):
        # nulls intermixed with datetimes/timedeltas generally infer as
        # 'mixed', so both coercions must be attempted; timedelta goes
        # first to avoid spurious datetime conversions (e.g. '00:00:01'
        # is a timedelta but technically also parses as a datetime)
        if lib.is_possible_datetimelike_array(_ensure_object(arr)):
            value = _coerce_timedelta(arr)
            if lib.infer_dtype(value) in ('mixed',):
                value = _coerce_datetime(arr)

    return value

Expand Down
57 changes: 35 additions & 22 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
_NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
ABCSparseSeries, _infer_dtype_from_scalar,
_is_null_datelike_scalar,
is_timedelta64_dtype, is_datetime64_dtype,)
is_timedelta64_dtype, is_datetime64_dtype,
_possibly_infer_to_datetimelike)
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
import pandas.core.common as com
Expand Down Expand Up @@ -1807,26 +1808,21 @@ def make_block(values, placement, klass=None, ndim=None,
elif issubclass(vtype, np.complexfloating):
klass = ComplexBlock

# try to infer a DatetimeBlock, or set to an ObjectBlock
else:

# we want to infer here if its a datetimelike if its object type
# this is pretty strict in that it requires a datetime/timedelta
# value IN addition to possible nulls/strings
# an array of ONLY strings will not be inferred
if np.prod(values.shape):
flat = values.ravel()

# try with just the first element; we just need to see if
# this is a datetime or not
inferred_type = lib.infer_dtype(flat[0:1])
if inferred_type in ['datetime', 'datetime64']:

# we have an object array that has been inferred as
# datetime, so convert it
try:
values = tslib.array_to_datetime(
flat).reshape(values.shape)
if issubclass(values.dtype.type, np.datetime64):
klass = DatetimeBlock
except: # it already object, so leave it
pass
result = _possibly_infer_to_datetimelike(values)
vtype = result.dtype.type
if issubclass(vtype, np.datetime64):
klass = DatetimeBlock
values = result
elif (issubclass(vtype, np.timedelta64)):
klass = TimeDeltaBlock
values = result

if klass is None:
klass = ObjectBlock
Expand Down Expand Up @@ -2525,7 +2521,7 @@ def _consolidate_inplace(self):
self._known_consolidated = True
self._rebuild_blknos_and_blklocs()

def get(self, item):
def get(self, item, fastpath=True):
"""
Return values for selected item (ndarray or BlockManager).
"""
Expand All @@ -2543,7 +2539,7 @@ def get(self, item):
else:
raise ValueError("cannot label index with a null key")

return self.iget(loc)
return self.iget(loc, fastpath=fastpath)
else:

if isnull(item):
Expand All @@ -2553,8 +2549,25 @@ def get(self, item):
return self.reindex_indexer(new_axis=self.items[indexer],
indexer=indexer, axis=0, allow_dups=True)

def iget(self, i):
return self.blocks[self._blknos[i]].iget(self._blklocs[i])
def iget(self, i, fastpath=True):
    """
    Return the values at position ``i``.

    If ``fastpath`` is True and the values form a 1-dim, non-sparse
    block, wrap them in a SingleBlockManager; otherwise return the raw
    ndarray.
    """
    blk = self.blocks[self._blknos[i]]
    vals = blk.iget(self._blklocs[i])

    if not fastpath or blk.is_sparse or vals.ndim != 1:
        return vals

    # fastpath shortcut for selecting a single dim from a 2-dim
    # BlockManager: rewrap the column in a same-class block
    wrapped = blk.make_block_same_class(vals,
                                        placement=slice(0, len(vals)),
                                        fastpath=True)
    return SingleBlockManager([wrapped], self.axes[1])


def get_scalar(self, tup):
"""
Expand Down
26 changes: 22 additions & 4 deletions pandas/io/tests/test_json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import os

import numpy as np

from pandas import Series, DataFrame, DatetimeIndex, Timestamp
import nose
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, _np_version_under1p7
import pandas as pd
read_json = pd.read_json

Expand Down Expand Up @@ -600,11 +600,29 @@ def test_url(self):
for c in ['created_at', 'closed_at', 'updated_at']:
self.assertEqual(result[c].dtype, 'datetime64[ns]')

def test_timedelta(self):
    # round-trip timedelta values through JSON; requires numpy >= 1.7
    # for timedelta64 support
    if _np_version_under1p7:
        raise nose.SkipTest("numpy < 1.7")

    from datetime import timedelta
    # JSON serializes timedeltas as milliseconds; convert back on read
    converter = lambda x: pd.to_timedelta(x, unit='ms')

    s = Series([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    assert_series_equal(s, pd.read_json(s.to_json(), typ='series').apply(converter))

    frame = DataFrame([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
    assert_frame_equal(
        frame, pd.read_json(frame.to_json()).apply(converter))

def test_default_handler(self):
    # objects that json cannot serialize natively (timedelta) are passed
    # through ``default_handler``; here str() stringifies them while the
    # plain int survives unchanged
    from datetime import timedelta

    frame = DataFrame([timedelta(23), timedelta(seconds=5), 42])

    expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5)), 42])
    assert_frame_equal(
        expected, pd.read_json(frame.to_json(default_handler=str)))

Expand Down
76 changes: 60 additions & 16 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,27 @@ def infer_dtype_list(list values):
pass


def is_possible_datetimelike_array(object arr):
    # Determine if we have a possible datetimelike (or null-like) array:
    # every element must be a string, a null, a datetime or a timedelta,
    # and at least one actual datetime/timedelta must be present
    # (an array of ONLY strings/nulls is not datetimelike).
    cdef:
        Py_ssize_t i, n = len(arr)
        bint seen_timedelta = 0, seen_datetime = 0
        object v

    for i in range(n):
        v = arr[i]
        if util.is_string_object(v):
            # strings are permitted but do not count as datetimelike
            continue
        elif util._checknull(v):
            continue
        elif is_datetime(v):
            seen_datetime = 1
        elif is_timedelta(v):
            seen_timedelta = 1
        else:
            # any other type disqualifies the array immediately
            return False
    return seen_datetime or seen_timedelta

cdef inline bint is_null_datetimelike(v):
# determine if we have a null for a timedelta/datetime (or integer versions)
if util._checknull(v):
Expand Down Expand Up @@ -331,61 +352,84 @@ def is_unicode_array(ndarray values):


def is_datetime_array(ndarray[object] values):
    # True when every element is a datetime or a null, EXCEPT when all
    # elements are regular nulls (an all-null array is not datetime)
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False

    # return False for all nulls
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not is_datetime(v):
            return False
    return null_count != n

def is_datetime64_array(ndarray values):
    # True when every element is a datetime64 or a null, EXCEPT when all
    # elements are regular nulls (an all-null array is not datetime64)
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False

    # return False for all nulls
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not util.is_datetime64_object(v):
            return False
    return null_count != n

def is_timedelta_array(ndarray values):
    # True when every element is a datetime.timedelta or a null, EXCEPT
    # when all elements are regular nulls
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not PyDelta_Check(v):
            return False
    return null_count != n

def is_timedelta64_array(ndarray values):
    # True when every element is a timedelta64 or a null, EXCEPT when
    # all elements are regular nulls
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not util.is_timedelta64_object(v):
            return False
    return null_count != n

def is_timedelta_or_timedelta64_array(ndarray values):
    """ infer with timedeltas and/or nat/none """
    # True when every element is a timedelta (either flavor) or a null,
    # EXCEPT when all elements are regular nulls
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not is_timedelta(v):
            return False
    return null_count != n

def is_date_array(ndarray[object] values):
cdef int i, n = len(values)
Expand Down
Loading