Skip to content

BUG/API: to_datetime preserves UTC offsets when parsing datetime strings #21822

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 43 commits into from
Jul 30, 2018
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
ac5a3d1
BUG: to_datetime no longer converts offsets to UTC
Jul 7, 2018
b81a8e9
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 8, 2018
6bf46a8
Document and now return offset
Jul 8, 2018
678b337
Add some tests, start converting some existing uses of array_to_datetime
Jul 8, 2018
1917148
Add more tests
Jul 8, 2018
581a33e
Adjust test
Jul 8, 2018
a1bc8f9
Flake8
Jul 8, 2018
80042e6
Add tests confirming new behavior
Jul 8, 2018
7c4135e
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 10, 2018
0651416
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 11, 2018
bacb6e3
Lint
Jul 11, 2018
a2f4aad
adjust a test
Jul 11, 2018
d48f341
Ensure box object index, pass tests
Jul 11, 2018
7efb25c
Adjust tests
Jul 11, 2018
1d527ff
Adjust test
Jul 11, 2018
f89d6b6
Cleanup and add comments
Jul 12, 2018
d91c63f
address comments
Jul 12, 2018
1054e8b
adjust test to be closer to the original behavior
Jul 12, 2018
d9cdc91
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 12, 2018
7d04613
Add TypeError clause
Jul 12, 2018
031284c
Add TypeError not ValueError
Jul 12, 2018
749e62e
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 12, 2018
23cbf75
fix typo
Jul 12, 2018
1e6f87a
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 18, 2018
7539bcf
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 19, 2018
c1f51cd
New implimentation
Jul 19, 2018
db75a24
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 20, 2018
4733ac5
Change implimentation and add some tests
Jul 20, 2018
e3db735
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 20, 2018
2fa681f
Add missing commas
Jul 20, 2018
5f36c98
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 24, 2018
d7ff275
Change implimentation since tzoffsets cannot be hashed
Jul 25, 2018
4ff7cb3
Add whatsnew
Jul 25, 2018
8463d91
Address review
Jul 25, 2018
dddc6b3
Address tzlocal
Jul 25, 2018
cca3983
Change is to == for older dateutil compat
Jul 26, 2018
e441be0
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 26, 2018
a8a65f7
Modify example in whatsnew to display
Jul 26, 2018
75f9fd9
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 26, 2018
6052475
Add more specific errors
Jul 27, 2018
f916c69
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 28, 2018
807a251
Merge remote-tracking branch 'upstream/master' into parse_tz_offsets
Jul 29, 2018
1cbd9b9
Add some benchmarks and reformat tests
Jul 30, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 48 additions & 10 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
if unit == 'ns':
if issubclass(values.dtype.type, np.integer):
return values.astype('M8[ns]')
return array_to_datetime(values.astype(object), errors=errors)
return array_to_datetime(values.astype(object), errors=errors)[0]

m = cast_from_unit(None, unit)

Expand Down Expand Up @@ -451,26 +451,47 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
dayfirst=False, yearfirst=False,
format=None, utc=None,
require_iso8601=False):
"""
Converts a 1D array of date-like values to a numpy array of either:
1) datetime64[ns] data
2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError is encountered

Also returns a pytz.FixedOffset if an array of strings with the same timezone offset if passed
and utc=True is not passed

Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric, strings

Returns
-------
(ndarray, timezone offset)
"""
cdef:
Py_ssize_t i, n = len(values)
object val, py_dt
object val, py_dt, tz, tz_out = None
ndarray[int64_t] iresult
ndarray[object] oresult
pandas_datetimestruct dts
bint utc_convert = bool(utc)
bint seen_integer = 0
bint seen_string = 0
bint seen_datetime = 0
bint seen_datetime_offset = 0
bint is_raise = errors=='raise'
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
_TSObject _ts
int out_local=0, out_tzoffset=0
# Can't directly create a ndarray[int] out_local, since most np.array constructors expect
# a long dtype, while _string_to_dts expectes purely int, maybe something I am missing?
ndarray[int64_t] out_local_values
ndarray[int64_t] out_tzoffset_values

# specify error conditions
assert is_raise or is_ignore or is_coerce

try:
out_local_values = np.empty(n, dtype=np.int64)
out_tzoffset_values = np.empty(n, dtype=np.int64)
result = np.empty(n, dtype='M8[ns]')
iresult = result.view('i8')
for i in range(n):
Expand Down Expand Up @@ -578,7 +599,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
raise ValueError("time data {val} doesn't match "
"format specified"
.format(val=val))
return values
return values, tz_out

try:
py_dt = parse_datetime_string(val, dayfirst=dayfirst,
Expand Down Expand Up @@ -606,8 +627,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
else:
# No error raised by string_to_dts, pick back up
# where we left off
out_tzoffset_values[i] = out_tzoffset
out_local_values[i] = out_local
value = dtstruct_to_dt64(&dts)
if out_local == 1:
seen_datetime_offset = 1
tz = pytz.FixedOffset(out_tzoffset)
value = tz_convert_single(value, tz, 'UTC')
iresult[i] = value
Expand All @@ -625,7 +649,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
raise ValueError("time data {val} doesn't "
"match format specified"
.format(val=val))
return values
return values, tz_out
raise

else:
Expand All @@ -651,7 +675,22 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
else:
raise TypeError

return result
if seen_datetime_offset and not utc_convert:
# GH 17697
# 1) If all the offsets are equal, then return one pytz.FixedOffset for the
# parsed dates so it can behave nicely with DatetimeIndex
# 2) If the offsets are different, then force the parsing down the object path
# where an array of datetimes (with individual datutil.tzoffsets) are returned

# Faster to compare integers than to compare objects
is_same_offsets = (out_tzoffset_values[0] == out_tzoffset_values).all()
if not is_same_offsets:
raise TypeError
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (pre-existing) pattern is pretty opaque to a first-time reader. What if instead of raising TypeError the fallback block became its own function that gets called from here?

else:
# Open question: should this return dateutil offset or pytz offset?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

default to dateutil

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this PR then, is it okay that parsing through Timestamp will produce a pytz.FixedOffset and to_datetime will producea dateutil.tz.tzoffset?

I should probably start a larger discussion whether we should be migrating from pytz to dateutil

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My mistake, forgot that Timestamp defaulted to pytz.FixedOffset. Sharing code with the Timestamp constructor is definitely a higher priority than defaulting to dateutil.tz.

tz_out = pytz.FixedOffset(out_tzoffset_values[0])

return result, tz_out
except OutOfBoundsDatetime:
if is_raise:
raise
Expand All @@ -673,7 +712,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
oresult[i] = val.item()
else:
oresult[i] = val
return oresult
return oresult, tz_out
except TypeError:
oresult = np.empty(n, dtype=object)

Expand All @@ -695,14 +734,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
except Exception:
if is_raise:
raise
return values
# oresult[i] = val
return values, tz_out
else:
if is_raise:
raise
return values
return values, tz_out

return oresult
return oresult, tz_out


cdef inline bint _parse_today_now(str val, int64_t* iresult):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,7 +917,7 @@ def try_datetime(v):
# GH19671
v = tslib.array_to_datetime(v,
require_iso8601=True,
errors='raise')
errors='raise')[0]
except ValueError:

# we might have a sequence of the same-datetimes with tz's
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,14 +266,17 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
result = arg

if result is None and (format is None or infer_datetime_format):
result = tslib.array_to_datetime(
result, tz_parsed = tslib.array_to_datetime(
arg,
errors=errors,
utc=tz == 'utc',
dayfirst=dayfirst,
yearfirst=yearfirst,
require_iso8601=require_iso8601
)
if tz_parsed is not None:
return DatetimeIndex._simple_new(result, name=name,
tz=tz_parsed)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

case with multiple tzs that has to get wrapped in object-dtype?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That case will result in tz_parsed = None so this branch will not be hit.


if is_datetime64_dtype(result) and box:
result = DatetimeIndex(result, tz=tz, name=name)
Expand Down Expand Up @@ -696,7 +699,7 @@ def calc(carg):
parsed = parsing.try_parse_year_month_day(carg / 10000,
carg / 100 % 100,
carg % 100)
return tslib.array_to_datetime(parsed, errors=errors)
return tslib.array_to_datetime(parsed, errors=errors)[0]

def calc_with_mask(carg, mask):
result = np.empty(carg.shape, dtype='M8[ns]')
Expand Down
59 changes: 43 additions & 16 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import dateutil
import numpy as np
from dateutil.parser import parse
from dateutil.tz.tz import tzoffset
from datetime import datetime, time
from distutils.version import LooseVersion

Expand Down Expand Up @@ -577,6 +578,36 @@ def test_week_without_day_and_calendar_year(self, date, format):
with tm.assert_raises_regex(ValueError, msg):
pd.to_datetime(date, format=format)

def test_ts_strings_with_same_offset(self):
# GH 17697, 11736
ts_str = "2015-11-18 15:30:00+05:30"
result = to_datetime(ts_str)
expected = Timestamp(ts_str)
assert result == expected

expected = DatetimeIndex([Timestamp(ts_str)] * 2)
result = to_datetime([ts_str] * 2)
tm.assert_index_equal(result, expected)

result = DatetimeIndex([ts_str] * 2)
tm.assert_index_equal(result, expected)

def test_ts_strings_with_different_offsets(self):
# GH 17697, 11736
ts_strings = ["2015-11-18 15:30:00+05:30",
"2015-11-18 16:30:00+06:30"]
result = to_datetime(ts_strings)
expected = np.array([datetime(2015, 11, 18, 15, 30,
tzinfo=tzoffset(None, 19800)),
datetime(2015, 11, 18, 16, 30,
tzinfo=tzoffset(None, 23400))],
dtype=object)
tm.assert_numpy_array_equal(result, expected)

result = to_datetime(ts_strings, utc=True)
expected = DatetimeIndex([Timestamp(2015, 11, 18, 10)] * 2, tz='UTC')
tm.assert_index_equal(result, expected)


class TestToDatetimeUnit(object):
@pytest.mark.parametrize('cache', [True, False])
Expand Down Expand Up @@ -1031,7 +1062,7 @@ def test_string_na_nat_conversion(self, cache):
else:
expected[i] = parse_date(val)

result = tslib.array_to_datetime(strings)
result = tslib.array_to_datetime(strings)[0]
tm.assert_almost_equal(result, expected)

result2 = to_datetime(strings, cache=cache)
Expand Down Expand Up @@ -1495,23 +1526,19 @@ def test_parsers_time(self):
assert res == expected_arr

@pytest.mark.parametrize('cache', [True, False])
def test_parsers_timezone_minute_offsets_roundtrip(self, cache):
@pytest.mark.parametrize('dt_string, tz, dt_string_repr', [
('2013-01-01 05:45+0545', pytz.FixedOffset(345),
"Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')"),
('2013-01-01 05:30+0530', pytz.FixedOffset(330),
"Timestamp('2013-01-01 05:30:00+0530', tz='pytz.FixedOffset(330)')")])
def test_parsers_timezone_minute_offsets_roundtrip(self, cache, dt_string,
tz, dt_string_repr):
# GH11708
base = to_datetime("2013-01-01 00:00:00", cache=cache)
dt_strings = [
('2013-01-01 05:45+0545',
"Asia/Katmandu",
"Timestamp('2013-01-01 05:45:00+0545', tz='Asia/Katmandu')"),
('2013-01-01 05:30+0530',
"Asia/Kolkata",
"Timestamp('2013-01-01 05:30:00+0530', tz='Asia/Kolkata')")
]

for dt_string, tz, dt_string_repr in dt_strings:
dt_time = to_datetime(dt_string, cache=cache)
assert base == dt_time
converted_time = dt_time.tz_localize('UTC').tz_convert(tz)
assert dt_string_repr == repr(converted_time)
base = base.tz_localize('UTC').tz_convert(tz)
dt_time = to_datetime(dt_string, cache=cache)
assert base == dt_time
assert dt_string_repr == repr(dt_time)


@pytest.fixture(params=['D', 's', 'ms', 'us', 'ns'])
Expand Down
52 changes: 36 additions & 16 deletions pandas/tests/tslibs/test_array_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import numpy as np
import pytest
import pytz
from dateutil.tz.tz import tzoffset

from pandas._libs import tslib
from pandas.compat.numpy import np_array_datetime64_compat
Expand Down Expand Up @@ -52,46 +54,64 @@ def test_parsers_iso8601_invalid(self, date_str):
class TestArrayToDatetime(object):
def test_parsing_valid_dates(self):
arr = np.array(['01-01-2013', '01-02-2013'], dtype=object)
result = tslib.array_to_datetime(arr)
result = tslib.array_to_datetime(arr)[0]
expected = ['2013-01-01T00:00:00.000000000-0000',
'2013-01-02T00:00:00.000000000-0000']
tm.assert_numpy_array_equal(
result,
np_array_datetime64_compat(expected, dtype='M8[ns]'))

arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object)
result = tslib.array_to_datetime(arr)
result = tslib.array_to_datetime(arr)[0]
expected = ['2013-09-16T00:00:00.000000000-0000',
'2013-09-17T00:00:00.000000000-0000']
tm.assert_numpy_array_equal(
result,
np_array_datetime64_compat(expected, dtype='M8[ns]'))

@pytest.mark.parametrize('dt_string', [
'01-01-2013 08:00:00+08:00',
'2013-01-01T08:00:00.000000000+0800',
'2012-12-31T16:00:00.000000000-0800',
'12-31-2012 23:00:00-01:00'])
def test_parsing_timezone_offsets(self, dt_string):
@pytest.mark.parametrize('dt_string, expected_tz', [
['01-01-2013 08:00:00+08:00', None],
['2013-01-01T08:00:00.000000000+0800', pytz.FixedOffset(480)],
['2012-12-31T16:00:00.000000000-0800', pytz.FixedOffset(-480)],
['12-31-2012 23:00:00-01:00', None]])
def test_parsing_timezone_offsets(self, dt_string, expected_tz):
# All of these datetime strings with offsets are equivalent
# to the same datetime after the timezone offset is added

# TODO: Appears that parsing non-ISO strings adjust the date to UTC
# but don't return the offset. Not sure if this is the intended
# behavior of non-iso strings in np_datetime_strings
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

np_datetime_strings doesn't handle non-ISO. That case ends up going through dateutil (via parse_datetime_string). I'm not totally sure what the TODO is for. '01-01-2013 00:00:00' comes back tz-naive, right?

arr = np.array(['01-01-2013 00:00:00'], dtype=object)
expected = tslib.array_to_datetime(arr)
expected = tslib.array_to_datetime(arr)[0]

arr = np.array([dt_string], dtype=object)
result = tslib.array_to_datetime(arr)
result, result_tz = tslib.array_to_datetime(arr)
tm.assert_numpy_array_equal(result, expected)
assert result_tz is expected_tz

def test_parsing_different_timezone_offsets(self):
# GH 17697
data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have these same tests using to_datetime? testing the cython function is obviously useful as well

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

data = np.array(data, dtype=object)
result, result_tz = tslib.array_to_datetime(data)
expected = np.array([datetime(2015, 11, 18, 15, 30,
tzinfo=tzoffset(None, 19800)),
datetime(2015, 11, 18, 15, 30,
tzinfo=tzoffset(None, 23400))],
dtype=object)
tm.assert_numpy_array_equal(result, expected)
assert result_tz is None

def test_number_looking_strings_not_into_datetime(self):
# GH#4601
# These strings don't look like datetimes so they shouldn't be
# attempted to be converted
arr = np.array(['-352.737091', '183.575577'], dtype=object)
result = tslib.array_to_datetime(arr, errors='ignore')
result = tslib.array_to_datetime(arr, errors='ignore')[0]
tm.assert_numpy_array_equal(result, arr)

arr = np.array(['1', '2', '3', '4', '5'], dtype=object)
result = tslib.array_to_datetime(arr, errors='ignore')
result = tslib.array_to_datetime(arr, errors='ignore')[0]
tm.assert_numpy_array_equal(result, arr)

@pytest.mark.parametrize('invalid_date', [
Expand All @@ -105,13 +125,13 @@ def test_coerce_outside_ns_bounds(self, invalid_date):
with pytest.raises(ValueError):
tslib.array_to_datetime(arr, errors='raise')

result = tslib.array_to_datetime(arr, errors='coerce')
result = tslib.array_to_datetime(arr, errors='coerce')[0]
expected = np.array([tslib.iNaT], dtype='M8[ns]')
tm.assert_numpy_array_equal(result, expected)

def test_coerce_outside_ns_bounds_one_valid(self):
arr = np.array(['1/1/1000', '1/1/2000'], dtype=object)
result = tslib.array_to_datetime(arr, errors='coerce')
result = tslib.array_to_datetime(arr, errors='coerce')[0]
expected = [tslib.iNaT,
'2000-01-01T00:00:00.000000000-0000']
tm.assert_numpy_array_equal(
Expand All @@ -123,11 +143,11 @@ def test_coerce_of_invalid_datetimes(self):

# Without coercing, the presence of any invalid dates prevents
# any values from being converted
result = tslib.array_to_datetime(arr, errors='ignore')
result = tslib.array_to_datetime(arr, errors='ignore')[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like writing these like

result, _ = ...... I think its more clear

tm.assert_numpy_array_equal(result, arr)

# With coercing, the invalid dates becomes iNaT
result = tslib.array_to_datetime(arr, errors='coerce')
result = tslib.array_to_datetime(arr, errors='coerce')[0]
expected = ['2013-01-01T00:00:00.000000000-0000',
tslib.iNaT,
tslib.iNaT]
Expand Down