Skip to content

API: Improved inference of datetime/timedelta with mixed null objects. (GH7431) #7435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions doc/source/v0.14.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@ API changes
day = offsets.Day(normalize=True)
day.apply(Timestamp('2014-01-01 09:00'))





- Improved inference of datetime/timedelta with mixed null objects. Regression from 0.13.1 in interpretation of an object Index
with all null elements (:issue:`7431`)

- Openpyxl now raises a ValueError on construction of the openpyxl writer
instead of warning on pandas import (:issue:`7284`).
Expand Down
91 changes: 74 additions & 17 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1782,24 +1782,81 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
value.dtype == np.object_)):
pass

# try to infer if we have a datetimelike here
# otherwise pass thru
else:
# we might have a array (or single object) that is datetime like,
# and no dtype is passed don't change the value unless we find a
# datetime set
v = value
if not is_list_like(v):
v = [v]
if len(v):
inferred_type = lib.infer_dtype(v)
if inferred_type in ['datetime', 'datetime64']:
try:
value = tslib.array_to_datetime(np.array(v))
except:
pass
elif inferred_type in ['timedelta', 'timedelta64']:
from pandas.tseries.timedeltas import \
_possibly_cast_to_timedelta
value = _possibly_cast_to_timedelta(value, coerce='compat')
value = _possibly_infer_to_datetimelike(value)

return value

def _possibly_infer_to_datetimelike(value):
    """
    Try to coerce an object array (or a single object) to a
    datetime64/timedelta64 array, returning the input unchanged when no
    datetimelike set is found.

    The inference is strict: an actual datetime/timedelta value is
    required (possibly mixed with nulls/string-likes); an array of ONLY
    strings is never coerced.
    """
    arr = value
    if not is_list_like(arr):
        arr = [arr]
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr)

    # remember the original shape; coercion works on the raveled data
    # and reshapes back on success
    shape = arr.shape
    if arr.ndim != 1:
        arr = arr.ravel()

    if not len(arr):
        return value

    def _coerce_datetime(x):
        # safe coerce to datetime64; on failure hand back the input
        try:
            return tslib.array_to_datetime(x, raise_=True).reshape(shape)
        except:
            return x

    def _coerce_timedelta(x):
        # safe coerce to timedelta64; try the string/object conversion
        # first
        from pandas.tseries.timedeltas import to_timedelta
        try:
            return to_timedelta(x).values.reshape(shape)
        except:
            # compat path for numpy < 1.7; string-likes will fail here
            from pandas.tseries.timedeltas import \
                _possibly_cast_to_timedelta
            try:
                return _possibly_cast_to_timedelta(x, coerce='compat').reshape(shape)
            except:
                return x

    # infer on a short prefix only, for performance
    inferred_type = lib.infer_dtype(arr[:min(3, len(arr))])

    if inferred_type in ('datetime', 'datetime64'):
        value = _coerce_datetime(arr)
    elif inferred_type in ('timedelta', 'timedelta64'):
        value = _coerce_timedelta(arr)
    elif inferred_type in ('mixed',):
        # nulls intermixed with datetimes/timedeltas generally infer as
        # 'mixed', so both coercions must be attempted; timedelta goes
        # first to avoid spurious datetime conversions (e.g. '00:00:01'
        # is a timedelta but technically also parses as a datetime)
        if lib.is_possible_datetimelike_array(_ensure_object(arr)):
            value = _coerce_timedelta(arr)
            if lib.infer_dtype(value) in ('mixed',):
                value = _coerce_datetime(arr)

    return value

Expand Down
57 changes: 35 additions & 22 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
_NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
ABCSparseSeries, _infer_dtype_from_scalar,
_is_null_datelike_scalar,
is_timedelta64_dtype, is_datetime64_dtype,)
is_timedelta64_dtype, is_datetime64_dtype,
_possibly_infer_to_datetimelike)
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
import pandas.core.common as com
Expand Down Expand Up @@ -1807,26 +1808,21 @@ def make_block(values, placement, klass=None, ndim=None,
elif issubclass(vtype, np.complexfloating):
klass = ComplexBlock

# try to infer a DatetimeBlock, or set to an ObjectBlock
else:

# we want to infer here if its a datetimelike if its object type
# this is pretty strict in that it requires a datetime/timedelta
# value IN addition to possible nulls/strings
# an array of ONLY strings will not be inferred
if np.prod(values.shape):
flat = values.ravel()

# try with just the first element; we just need to see if
# this is a datetime or not
inferred_type = lib.infer_dtype(flat[0:1])
if inferred_type in ['datetime', 'datetime64']:

# we have an object array that has been inferred as
# datetime, so convert it
try:
values = tslib.array_to_datetime(
flat).reshape(values.shape)
if issubclass(values.dtype.type, np.datetime64):
klass = DatetimeBlock
except: # it already object, so leave it
pass
result = _possibly_infer_to_datetimelike(values)
vtype = result.dtype.type
if issubclass(vtype, np.datetime64):
klass = DatetimeBlock
values = result
elif (issubclass(vtype, np.timedelta64)):
klass = TimeDeltaBlock
values = result

if klass is None:
klass = ObjectBlock
Expand Down Expand Up @@ -2525,7 +2521,7 @@ def _consolidate_inplace(self):
self._known_consolidated = True
self._rebuild_blknos_and_blklocs()

def get(self, item):
def get(self, item, fastpath=True):
"""
Return values for selected item (ndarray or BlockManager).
"""
Expand All @@ -2543,7 +2539,7 @@ def get(self, item):
else:
raise ValueError("cannot label index with a null key")

return self.iget(loc)
return self.iget(loc, fastpath=fastpath)
else:

if isnull(item):
Expand All @@ -2553,8 +2549,25 @@ def get(self, item):
return self.reindex_indexer(new_axis=self.items[indexer],
indexer=indexer, axis=0, allow_dups=True)

def iget(self, i):
return self.blocks[self._blknos[i]].iget(self._blklocs[i])
def iget(self, i, fastpath=True):
    """
    Return the values at position ``i``.

    If ``fastpath`` is True and the values form a 1-dim, non-sparse
    block, wrap them in a SingleBlockManager; otherwise return the raw
    ndarray.
    """
    blk = self.blocks[self._blknos[i]]
    vals = blk.iget(self._blklocs[i])

    if not fastpath or blk.is_sparse or vals.ndim != 1:
        return vals

    # fastpath shortcut for selecting a single dim from a 2-dim
    # BlockManager: rewrap the column in a same-class block
    wrapped = blk.make_block_same_class(vals,
                                        placement=slice(0, len(vals)),
                                        fastpath=True)
    return SingleBlockManager([wrapped], self.axes[1])


def get_scalar(self, tup):
"""
Expand Down
26 changes: 22 additions & 4 deletions pandas/io/tests/test_json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import os

import numpy as np

from pandas import Series, DataFrame, DatetimeIndex, Timestamp
import nose
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, _np_version_under1p7
import pandas as pd
read_json = pd.read_json

Expand Down Expand Up @@ -600,11 +600,29 @@ def test_url(self):
for c in ['created_at', 'closed_at', 'updated_at']:
self.assertEqual(result[c].dtype, 'datetime64[ns]')

def test_timedelta(self):
    # round-trip timedelta values through JSON; requires numpy >= 1.7
    # for timedelta64 support
    if _np_version_under1p7:
        raise nose.SkipTest("numpy < 1.7")

    from datetime import timedelta
    # JSON serializes timedeltas as milliseconds; convert back on read
    converter = lambda x: pd.to_timedelta(x, unit='ms')

    s = Series([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    assert_series_equal(s, pd.read_json(s.to_json(), typ='series').apply(converter))

    frame = DataFrame([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
    assert_frame_equal(
        frame, pd.read_json(frame.to_json()).apply(converter))

def test_default_handler(self):
    # objects that json cannot serialize natively (timedelta) are passed
    # through ``default_handler``; here str() stringifies them while the
    # plain int survives unchanged
    from datetime import timedelta

    frame = DataFrame([timedelta(23), timedelta(seconds=5), 42])

    expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5)), 42])
    assert_frame_equal(
        expected, pd.read_json(frame.to_json(default_handler=str)))

Expand Down
76 changes: 60 additions & 16 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,27 @@ def infer_dtype_list(list values):
pass


def is_possible_datetimelike_array(object arr):
    # Determine if we have a possible datetimelike (or null-like) array:
    # every element must be a string, a null, a datetime or a timedelta,
    # and at least one actual datetime/timedelta must be present
    # (an array of ONLY strings/nulls is not datetimelike).
    cdef:
        Py_ssize_t i, n = len(arr)
        bint seen_timedelta = 0, seen_datetime = 0
        object v

    for i in range(n):
        v = arr[i]
        if util.is_string_object(v):
            # strings are permitted but do not count as datetimelike
            continue
        elif util._checknull(v):
            continue
        elif is_datetime(v):
            seen_datetime = 1
        elif is_timedelta(v):
            seen_timedelta = 1
        else:
            # any other type disqualifies the array immediately
            return False
    return seen_datetime or seen_timedelta

cdef inline bint is_null_datetimelike(v):
# determine if we have a null for a timedelta/datetime (or integer versions)
if util._checknull(v):
Expand Down Expand Up @@ -331,61 +352,84 @@ def is_unicode_array(ndarray values):


def is_datetime_array(ndarray[object] values):
    # True when every element is a datetime or a null, EXCEPT when all
    # elements are regular nulls (an all-null array is not datetime)
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False

    # return False for all nulls
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not is_datetime(v):
            return False
    return null_count != n

def is_datetime64_array(ndarray values):
    # True when every element is a datetime64 or a null, EXCEPT when all
    # elements are regular nulls (an all-null array is not datetime64)
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False

    # return False for all nulls
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not util.is_datetime64_object(v):
            return False
    return null_count != n

def is_timedelta_array(ndarray values):
    # True when every element is a datetime.timedelta or a null, EXCEPT
    # when all elements are regular nulls
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not PyDelta_Check(v):
            return False
    return null_count != n

def is_timedelta64_array(ndarray values):
    # True when every element is a timedelta64 or a null, EXCEPT when
    # all elements are regular nulls
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not util.is_timedelta64_object(v):
            return False
    return null_count != n

def is_timedelta_or_timedelta64_array(ndarray values):
    """ infer with timedeltas and/or nat/none """
    # True when every element is a timedelta (either flavor) or a null,
    # EXCEPT when all elements are regular nulls
    cdef int i, null_count = 0, n = len(values)
    cdef object v
    if n == 0:
        return False
    for i in range(n):
        v = values[i]
        if is_null_datetimelike(v):
            # we are a regular null
            if util._checknull(v):
                null_count += 1
        elif not is_timedelta(v):
            return False
    return null_count != n

def is_date_array(ndarray[object] values):
cdef int i, n = len(values)
Expand Down
Loading