pandas-dev · jreback · Jul 30, 2018 · Jul 7, 2018 · Jul 8, 2018 · Jul 8, 2018
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -322,7 +322,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
     if unit == 'ns':
         if issubclass(values.dtype.type, np.integer):
             return values.astype('M8[ns]')
-        return array_to_datetime(values.astype(object), errors=errors)
+        return array_to_datetime(values.astype(object), errors=errors)[0]
 
     m = cast_from_unit(None, unit)
 
@@ -451,26 +451,47 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         dayfirst=False, yearfirst=False,
                         format=None, utc=None,
                         require_iso8601=False):
+    """
+    Converts a 1D array of date-like values to a numpy array of either:
+        1) datetime64[ns] data
+        2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError is encountered
+
+    Also returns a pytz.FixedOffset if an array of strings with the same timezone offset if passed
+    and utc=True is not passed
+
+    Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric, strings
+
+    Returns
+    -------
+    (ndarray, timezone offset) 
+    """
     cdef:
         Py_ssize_t i, n = len(values)
-        object val, py_dt
+        object val, py_dt, tz, tz_out = None
         ndarray[int64_t] iresult
         ndarray[object] oresult
         pandas_datetimestruct dts
         bint utc_convert = bool(utc)
         bint seen_integer = 0
         bint seen_string = 0
         bint seen_datetime = 0
+        bint seen_datetime_offset = 0
         bint is_raise = errors=='raise'
         bint is_ignore = errors=='ignore'
         bint is_coerce = errors=='coerce'
         _TSObject _ts
         int out_local=0, out_tzoffset=0
+        # Can't directly create a ndarray[int] out_local, since most np.array constructors expect
+        # a long dtype, while _string_to_dts expectes purely int, maybe something I am missing?
+        ndarray[int64_t] out_local_values
+        ndarray[int64_t] out_tzoffset_values
 
     # specify error conditions
     assert is_raise or is_ignore or is_coerce
 
     try:
+        out_local_values = np.empty(n, dtype=np.int64)
+        out_tzoffset_values = np.empty(n, dtype=np.int64)
         result = np.empty(n, dtype='M8[ns]')
         iresult = result.view('i8')
         for i in range(n):
@@ -578,7 +599,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                             raise ValueError("time data {val} doesn't match "
                                              "format specified"
                                              .format(val=val))
-                        return values
+                        return values, tz_out
 
                     try:
                         py_dt = parse_datetime_string(val, dayfirst=dayfirst,
@@ -606,8 +627,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                 else:
                     # No error raised by string_to_dts, pick back up
                     # where we left off
+                    out_tzoffset_values[i] = out_tzoffset
+                    out_local_values[i] = out_local
                     value = dtstruct_to_dt64(&dts)
                     if out_local == 1:
+                        seen_datetime_offset = 1
                         tz = pytz.FixedOffset(out_tzoffset)
                         value = tz_convert_single(value, tz, 'UTC')
                     iresult[i] = value
@@ -625,7 +649,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                                 raise ValueError("time data {val} doesn't "
                                                  "match format specified"
                                                  .format(val=val))
-                            return values
+                            return values, tz_out
                         raise
 
             else:
@@ -651,7 +675,22 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
             else:
                 raise TypeError
 
-        return result
+        if seen_datetime_offset and not utc_convert:
+            # GH 17697
+            # 1) If all the offsets are equal, then return one pytz.FixedOffset for the
+            #    parsed dates so it can behave nicely with DatetimeIndex
+            # 2) If the offsets are different, then force the parsing down the object path
+            #    where an array of datetimes (with individual datutil.tzoffsets) are returned
+
+            # Faster to compare integers than to compare objects
+            is_same_offsets = (out_tzoffset_values[0] == out_tzoffset_values).all()
+            if not is_same_offsets:
+                raise TypeError
+            else:
+                # Open question: should this return dateutil offset or pytz offset?
+                tz_out = pytz.FixedOffset(out_tzoffset_values[0])
+
+        return result, tz_out
     except OutOfBoundsDatetime:
         if is_raise:
             raise
@@ -673,7 +712,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                     oresult[i] = val.item()
             else:
                 oresult[i] = val
-        return oresult
+        return oresult, tz_out
     except TypeError:
         oresult = np.empty(n, dtype=object)
 
@@ -695,14 +734,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                 except Exception:
                     if is_raise:
                         raise
-                    return values
-                    # oresult[i] = val
+                    return values, tz_out
             else:
                 if is_raise:
                     raise
-                return values
+                return values, tz_out
 
-        return oresult
+        return oresult, tz_out
 
 
 cdef inline bint _parse_today_now(str val, int64_t* iresult):

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -917,7 +917,7 @@ def try_datetime(v):
             # GH19671
             v = tslib.array_to_datetime(v,
                                         require_iso8601=True,
-                                        errors='raise')
+                                        errors='raise')[0]
         except ValueError:
 
             # we might have a sequence of the same-datetimes with tz's

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -266,14 +266,17 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
                         result = arg
 
         if result is None and (format is None or infer_datetime_format):
-            result = tslib.array_to_datetime(
+            result, tz_parsed = tslib.array_to_datetime(
                 arg,
                 errors=errors,
                 utc=tz == 'utc',
                 dayfirst=dayfirst,
                 yearfirst=yearfirst,
                 require_iso8601=require_iso8601
             )
+            if tz_parsed is not None:
+                return DatetimeIndex._simple_new(result, name=name,
+                                                 tz=tz_parsed)
 
         if is_datetime64_dtype(result) and box:
             result = DatetimeIndex(result, tz=tz, name=name)
@@ -696,7 +699,7 @@ def calc(carg):
         parsed = parsing.try_parse_year_month_day(carg / 10000,
                                                   carg / 100 % 100,
                                                   carg % 100)
-        return tslib.array_to_datetime(parsed, errors=errors)
+        return tslib.array_to_datetime(parsed, errors=errors)[0]
 
     def calc_with_mask(carg, mask):
         result = np.empty(carg.shape, dtype='M8[ns]')

diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
@@ -7,6 +7,7 @@
 import dateutil
 import numpy as np
 from dateutil.parser import parse
+from dateutil.tz.tz import tzoffset
 from datetime import datetime, time
 from distutils.version import LooseVersion
 
@@ -577,6 +578,36 @@ def test_week_without_day_and_calendar_year(self, date, format):
         with tm.assert_raises_regex(ValueError, msg):
             pd.to_datetime(date, format=format)
 
+    def test_ts_strings_with_same_offset(self):
+        # GH 17697, 11736
+        ts_str = "2015-11-18 15:30:00+05:30"
+        result = to_datetime(ts_str)
+        expected = Timestamp(ts_str)
+        assert result == expected
+
+        expected = DatetimeIndex([Timestamp(ts_str)] * 2)
+        result = to_datetime([ts_str] * 2)
+        tm.assert_index_equal(result, expected)
+
+        result = DatetimeIndex([ts_str] * 2)
+        tm.assert_index_equal(result, expected)
+
+    def test_ts_strings_with_different_offsets(self):
+        # GH 17697, 11736
+        ts_strings = ["2015-11-18 15:30:00+05:30",
+                      "2015-11-18 16:30:00+06:30"]
+        result = to_datetime(ts_strings)
+        expected = np.array([datetime(2015, 11, 18, 15, 30,
+                                      tzinfo=tzoffset(None, 19800)),
+                             datetime(2015, 11, 18, 16, 30,
+                                      tzinfo=tzoffset(None, 23400))],
+                            dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = to_datetime(ts_strings, utc=True)
+        expected = DatetimeIndex([Timestamp(2015, 11, 18, 10)] * 2, tz='UTC')
+        tm.assert_index_equal(result, expected)
+
 
 class TestToDatetimeUnit(object):
     @pytest.mark.parametrize('cache', [True, False])
@@ -1031,7 +1062,7 @@ def test_string_na_nat_conversion(self, cache):
             else:
                 expected[i] = parse_date(val)
 
-        result = tslib.array_to_datetime(strings)
+        result = tslib.array_to_datetime(strings)[0]
         tm.assert_almost_equal(result, expected)
 
         result2 = to_datetime(strings, cache=cache)
@@ -1495,23 +1526,19 @@ def test_parsers_time(self):
         assert res == expected_arr
 
     @pytest.mark.parametrize('cache', [True, False])
-    def test_parsers_timezone_minute_offsets_roundtrip(self, cache):
+    @pytest.mark.parametrize('dt_string, tz, dt_string_repr', [
+        ('2013-01-01 05:45+0545', pytz.FixedOffset(345),
+         "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')"),
+        ('2013-01-01 05:30+0530', pytz.FixedOffset(330),
+         "Timestamp('2013-01-01 05:30:00+0530', tz='pytz.FixedOffset(330)')")])
+    def test_parsers_timezone_minute_offsets_roundtrip(self, cache, dt_string,
+                                                       tz, dt_string_repr):
         # GH11708
         base = to_datetime("2013-01-01 00:00:00", cache=cache)
-        dt_strings = [
-            ('2013-01-01 05:45+0545',
-             "Asia/Katmandu",
-             "Timestamp('2013-01-01 05:45:00+0545', tz='Asia/Katmandu')"),
-            ('2013-01-01 05:30+0530',
-             "Asia/Kolkata",
-             "Timestamp('2013-01-01 05:30:00+0530', tz='Asia/Kolkata')")
-        ]
-
-        for dt_string, tz, dt_string_repr in dt_strings:
-            dt_time = to_datetime(dt_string, cache=cache)
-            assert base == dt_time
-            converted_time = dt_time.tz_localize('UTC').tz_convert(tz)
-            assert dt_string_repr == repr(converted_time)
+        base = base.tz_localize('UTC').tz_convert(tz)
+        dt_time = to_datetime(dt_string, cache=cache)
+        assert base == dt_time
+        assert dt_string_repr == repr(dt_time)
 
 
 @pytest.fixture(params=['D', 's', 'ms', 'us', 'ns'])

diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pytest
+import pytz
+from dateutil.tz.tz import tzoffset
 
 from pandas._libs import tslib
 from pandas.compat.numpy import np_array_datetime64_compat
@@ -52,46 +54,64 @@ def test_parsers_iso8601_invalid(self, date_str):
 class TestArrayToDatetime(object):
     def test_parsing_valid_dates(self):
         arr = np.array(['01-01-2013', '01-02-2013'], dtype=object)
-        result = tslib.array_to_datetime(arr)
+        result = tslib.array_to_datetime(arr)[0]
         expected = ['2013-01-01T00:00:00.000000000-0000',
                     '2013-01-02T00:00:00.000000000-0000']
         tm.assert_numpy_array_equal(
             result,
             np_array_datetime64_compat(expected, dtype='M8[ns]'))
 
         arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object)
-        result = tslib.array_to_datetime(arr)
+        result = tslib.array_to_datetime(arr)[0]
         expected = ['2013-09-16T00:00:00.000000000-0000',
                     '2013-09-17T00:00:00.000000000-0000']
         tm.assert_numpy_array_equal(
             result,
             np_array_datetime64_compat(expected, dtype='M8[ns]'))
 
-    @pytest.mark.parametrize('dt_string', [
-        '01-01-2013 08:00:00+08:00',
-        '2013-01-01T08:00:00.000000000+0800',
-        '2012-12-31T16:00:00.000000000-0800',
-        '12-31-2012 23:00:00-01:00'])
-    def test_parsing_timezone_offsets(self, dt_string):
+    @pytest.mark.parametrize('dt_string, expected_tz', [
+        ['01-01-2013 08:00:00+08:00', None],
+        ['2013-01-01T08:00:00.000000000+0800', pytz.FixedOffset(480)],
+        ['2012-12-31T16:00:00.000000000-0800', pytz.FixedOffset(-480)],
+        ['12-31-2012 23:00:00-01:00', None]])
+    def test_parsing_timezone_offsets(self, dt_string, expected_tz):
         # All of these datetime strings with offsets are equivalent
         # to the same datetime after the timezone offset is added
+
+        # TODO: Appears that parsing non-ISO strings adjust the date to UTC
+        # but don't return the offset. Not sure if this is the intended
+        # behavior of non-iso strings in np_datetime_strings
         arr = np.array(['01-01-2013 00:00:00'], dtype=object)
-        expected = tslib.array_to_datetime(arr)
+        expected = tslib.array_to_datetime(arr)[0]
 
         arr = np.array([dt_string], dtype=object)
-        result = tslib.array_to_datetime(arr)
+        result, result_tz = tslib.array_to_datetime(arr)
+        tm.assert_numpy_array_equal(result, expected)
+        assert result_tz is expected_tz
+
+    def test_parsing_different_timezone_offsets(self):
+        # GH 17697
+        data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"]
+        data = np.array(data, dtype=object)
+        result, result_tz = tslib.array_to_datetime(data)
+        expected = np.array([datetime(2015, 11, 18, 15, 30,
+                                      tzinfo=tzoffset(None, 19800)),
+                             datetime(2015, 11, 18, 15, 30,
+                                      tzinfo=tzoffset(None, 23400))],
+                            dtype=object)
         tm.assert_numpy_array_equal(result, expected)
+        assert result_tz is None
 
     def test_number_looking_strings_not_into_datetime(self):
         # GH#4601
         # These strings don't look like datetimes so they shouldn't be
         # attempted to be converted
         arr = np.array(['-352.737091', '183.575577'], dtype=object)
-        result = tslib.array_to_datetime(arr, errors='ignore')
+        result = tslib.array_to_datetime(arr, errors='ignore')[0]
         tm.assert_numpy_array_equal(result, arr)
 
         arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
-        result = tslib.array_to_datetime(arr, errors='ignore')
+        result = tslib.array_to_datetime(arr, errors='ignore')[0]
         tm.assert_numpy_array_equal(result, arr)
 
     @pytest.mark.parametrize('invalid_date', [
@@ -105,13 +125,13 @@ def test_coerce_outside_ns_bounds(self, invalid_date):
         with pytest.raises(ValueError):
             tslib.array_to_datetime(arr, errors='raise')
 
-        result = tslib.array_to_datetime(arr, errors='coerce')
+        result = tslib.array_to_datetime(arr, errors='coerce')[0]
         expected = np.array([tslib.iNaT], dtype='M8[ns]')
         tm.assert_numpy_array_equal(result, expected)
 
     def test_coerce_outside_ns_bounds_one_valid(self):
         arr = np.array(['1/1/1000', '1/1/2000'], dtype=object)
-        result = tslib.array_to_datetime(arr, errors='coerce')
+        result = tslib.array_to_datetime(arr, errors='coerce')[0]
         expected = [tslib.iNaT,
                     '2000-01-01T00:00:00.000000000-0000']
         tm.assert_numpy_array_equal(
@@ -123,11 +143,11 @@ def test_coerce_of_invalid_datetimes(self):
 
         # Without coercing, the presence of any invalid dates prevents
         # any values from being converted
-        result = tslib.array_to_datetime(arr, errors='ignore')
+        result = tslib.array_to_datetime(arr, errors='ignore')[0]
         tm.assert_numpy_array_equal(result, arr)
 
         # With coercing, the invalid dates becomes iNaT
-        result = tslib.array_to_datetime(arr, errors='coerce')
+        result = tslib.array_to_datetime(arr, errors='coerce')[0]
         expected = ['2013-01-01T00:00:00.000000000-0000',
                     tslib.iNaT,
                     tslib.iNaT]