-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG/API: to_datetime preserves UTC offsets when parsing datetime strings #21822
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
ac5a3d1
b81a8e9
6bf46a8
678b337
1917148
581a33e
a1bc8f9
80042e6
7c4135e
0651416
bacb6e3
a2f4aad
d48f341
7efb25c
1d527ff
f89d6b6
d91c63f
1054e8b
d9cdc91
7d04613
031284c
749e62e
23cbf75
1e6f87a
7539bcf
c1f51cd
db75a24
4733ac5
e3db735
2fa681f
5f36c98
d7ff275
4ff7cb3
8463d91
dddc6b3
cca3983
e441be0
a8a65f7
75f9fd9
6052475
f916c69
807a251
1cbd9b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -322,7 +322,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): | |
if unit == 'ns': | ||
if issubclass(values.dtype.type, np.integer): | ||
return values.astype('M8[ns]') | ||
return array_to_datetime(values.astype(object), errors=errors) | ||
return array_to_datetime(values.astype(object), errors=errors)[0] | ||
|
||
m = cast_from_unit(None, unit) | ||
|
||
|
@@ -451,26 +451,47 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
dayfirst=False, yearfirst=False, | ||
format=None, utc=None, | ||
require_iso8601=False): | ||
""" | ||
Converts a 1D array of date-like values to a numpy array of either: | ||
1) datetime64[ns] data | ||
2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError is encountered | ||
|
||
Also returns a pytz.FixedOffset if an array of strings with the same timezone offset if passed | ||
and utc=True is not passed | ||
|
||
Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric, strings | ||
|
||
Returns | ||
------- | ||
(ndarray, timezone offset) | ||
""" | ||
cdef: | ||
Py_ssize_t i, n = len(values) | ||
object val, py_dt | ||
object val, py_dt, tz, tz_out = None | ||
ndarray[int64_t] iresult | ||
ndarray[object] oresult | ||
pandas_datetimestruct dts | ||
bint utc_convert = bool(utc) | ||
bint seen_integer = 0 | ||
bint seen_string = 0 | ||
bint seen_datetime = 0 | ||
bint seen_datetime_offset = 0 | ||
bint is_raise = errors=='raise' | ||
bint is_ignore = errors=='ignore' | ||
bint is_coerce = errors=='coerce' | ||
_TSObject _ts | ||
int out_local=0, out_tzoffset=0 | ||
# Can't directly create a ndarray[int] out_local, since most np.array constructors expect | ||
# a long dtype, while _string_to_dts expectes purely int, maybe something I am missing? | ||
ndarray[int64_t] out_local_values | ||
ndarray[int64_t] out_tzoffset_values | ||
|
||
# specify error conditions | ||
assert is_raise or is_ignore or is_coerce | ||
|
||
try: | ||
out_local_values = np.empty(n, dtype=np.int64) | ||
out_tzoffset_values = np.empty(n, dtype=np.int64) | ||
result = np.empty(n, dtype='M8[ns]') | ||
iresult = result.view('i8') | ||
for i in range(n): | ||
|
@@ -578,7 +599,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
raise ValueError("time data {val} doesn't match " | ||
"format specified" | ||
.format(val=val)) | ||
return values | ||
return values, tz_out | ||
|
||
try: | ||
py_dt = parse_datetime_string(val, dayfirst=dayfirst, | ||
|
@@ -606,8 +627,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
else: | ||
# No error raised by string_to_dts, pick back up | ||
# where we left off | ||
out_tzoffset_values[i] = out_tzoffset | ||
out_local_values[i] = out_local | ||
value = dtstruct_to_dt64(&dts) | ||
if out_local == 1: | ||
seen_datetime_offset = 1 | ||
tz = pytz.FixedOffset(out_tzoffset) | ||
value = tz_convert_single(value, tz, 'UTC') | ||
iresult[i] = value | ||
|
@@ -625,7 +649,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
raise ValueError("time data {val} doesn't " | ||
"match format specified" | ||
.format(val=val)) | ||
return values | ||
return values, tz_out | ||
raise | ||
|
||
else: | ||
|
@@ -651,7 +675,22 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
else: | ||
raise TypeError | ||
|
||
return result | ||
if seen_datetime_offset and not utc_convert: | ||
# GH 17697 | ||
# 1) If all the offsets are equal, then return one pytz.FixedOffset for the | ||
# parsed dates so it can behave nicely with DatetimeIndex | ||
# 2) If the offsets are different, then force the parsing down the object path | ||
# where an array of datetimes (with individual datutil.tzoffsets) are returned | ||
|
||
# Faster to compare integers than to compare objects | ||
is_same_offsets = (out_tzoffset_values[0] == out_tzoffset_values).all() | ||
if not is_same_offsets: | ||
raise TypeError | ||
else: | ||
# Open question: should this return dateutil offset or pytz offset? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. default to dateutil There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this PR then, is it okay that parsing through I should probably start a larger discussion whether we should be migrating from pytz to dateutil There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My mistake, forgot that |
||
tz_out = pytz.FixedOffset(out_tzoffset_values[0]) | ||
|
||
return result, tz_out | ||
except OutOfBoundsDatetime: | ||
if is_raise: | ||
raise | ||
|
@@ -673,7 +712,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
oresult[i] = val.item() | ||
else: | ||
oresult[i] = val | ||
return oresult | ||
return oresult, tz_out | ||
except TypeError: | ||
oresult = np.empty(n, dtype=object) | ||
|
||
|
@@ -695,14 +734,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
except Exception: | ||
if is_raise: | ||
raise | ||
return values | ||
# oresult[i] = val | ||
return values, tz_out | ||
else: | ||
if is_raise: | ||
raise | ||
return values | ||
return values, tz_out | ||
|
||
return oresult | ||
return oresult, tz_out | ||
|
||
|
||
cdef inline bint _parse_today_now(str val, int64_t* iresult): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -266,14 +266,17 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, | |
result = arg | ||
|
||
if result is None and (format is None or infer_datetime_format): | ||
result = tslib.array_to_datetime( | ||
result, tz_parsed = tslib.array_to_datetime( | ||
arg, | ||
errors=errors, | ||
utc=tz == 'utc', | ||
dayfirst=dayfirst, | ||
yearfirst=yearfirst, | ||
require_iso8601=require_iso8601 | ||
) | ||
if tz_parsed is not None: | ||
return DatetimeIndex._simple_new(result, name=name, | ||
tz=tz_parsed) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. case with multiple tzs that has to get wrapped in object-dtype? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That case will result in |
||
|
||
if is_datetime64_dtype(result) and box: | ||
result = DatetimeIndex(result, tz=tz, name=name) | ||
|
@@ -696,7 +699,7 @@ def calc(carg): | |
parsed = parsing.try_parse_year_month_day(carg / 10000, | ||
carg / 100 % 100, | ||
carg % 100) | ||
return tslib.array_to_datetime(parsed, errors=errors) | ||
return tslib.array_to_datetime(parsed, errors=errors)[0] | ||
|
||
def calc_with_mask(carg, mask): | ||
result = np.empty(carg.shape, dtype='M8[ns]') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,8 @@ | |
|
||
import numpy as np | ||
import pytest | ||
import pytz | ||
from dateutil.tz.tz import tzoffset | ||
|
||
from pandas._libs import tslib | ||
from pandas.compat.numpy import np_array_datetime64_compat | ||
|
@@ -52,46 +54,64 @@ def test_parsers_iso8601_invalid(self, date_str): | |
class TestArrayToDatetime(object): | ||
def test_parsing_valid_dates(self): | ||
arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) | ||
result = tslib.array_to_datetime(arr) | ||
result = tslib.array_to_datetime(arr)[0] | ||
expected = ['2013-01-01T00:00:00.000000000-0000', | ||
'2013-01-02T00:00:00.000000000-0000'] | ||
tm.assert_numpy_array_equal( | ||
result, | ||
np_array_datetime64_compat(expected, dtype='M8[ns]')) | ||
|
||
arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) | ||
result = tslib.array_to_datetime(arr) | ||
result = tslib.array_to_datetime(arr)[0] | ||
expected = ['2013-09-16T00:00:00.000000000-0000', | ||
'2013-09-17T00:00:00.000000000-0000'] | ||
tm.assert_numpy_array_equal( | ||
result, | ||
np_array_datetime64_compat(expected, dtype='M8[ns]')) | ||
|
||
@pytest.mark.parametrize('dt_string', [ | ||
'01-01-2013 08:00:00+08:00', | ||
'2013-01-01T08:00:00.000000000+0800', | ||
'2012-12-31T16:00:00.000000000-0800', | ||
'12-31-2012 23:00:00-01:00']) | ||
def test_parsing_timezone_offsets(self, dt_string): | ||
@pytest.mark.parametrize('dt_string, expected_tz', [ | ||
['01-01-2013 08:00:00+08:00', None], | ||
['2013-01-01T08:00:00.000000000+0800', pytz.FixedOffset(480)], | ||
['2012-12-31T16:00:00.000000000-0800', pytz.FixedOffset(-480)], | ||
['12-31-2012 23:00:00-01:00', None]]) | ||
def test_parsing_timezone_offsets(self, dt_string, expected_tz): | ||
# All of these datetime strings with offsets are equivalent | ||
# to the same datetime after the timezone offset is added | ||
|
||
# TODO: Appears that parsing non-ISO strings adjust the date to UTC | ||
# but don't return the offset. Not sure if this is the intended | ||
# behavior of non-iso strings in np_datetime_strings | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. np_datetime_strings doesn't handle non-ISO. That case ends up going through dateutil (via |
||
arr = np.array(['01-01-2013 00:00:00'], dtype=object) | ||
expected = tslib.array_to_datetime(arr) | ||
expected = tslib.array_to_datetime(arr)[0] | ||
|
||
arr = np.array([dt_string], dtype=object) | ||
result = tslib.array_to_datetime(arr) | ||
result, result_tz = tslib.array_to_datetime(arr) | ||
tm.assert_numpy_array_equal(result, expected) | ||
assert result_tz is expected_tz | ||
|
||
def test_parsing_different_timezone_offsets(self): | ||
# GH 17697 | ||
data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we have these same tests using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
data = np.array(data, dtype=object) | ||
result, result_tz = tslib.array_to_datetime(data) | ||
expected = np.array([datetime(2015, 11, 18, 15, 30, | ||
tzinfo=tzoffset(None, 19800)), | ||
datetime(2015, 11, 18, 15, 30, | ||
tzinfo=tzoffset(None, 23400))], | ||
dtype=object) | ||
tm.assert_numpy_array_equal(result, expected) | ||
assert result_tz is None | ||
|
||
def test_number_looking_strings_not_into_datetime(self): | ||
# GH#4601 | ||
# These strings don't look like datetimes so they shouldn't be | ||
# attempted to be converted | ||
arr = np.array(['-352.737091', '183.575577'], dtype=object) | ||
result = tslib.array_to_datetime(arr, errors='ignore') | ||
result = tslib.array_to_datetime(arr, errors='ignore')[0] | ||
tm.assert_numpy_array_equal(result, arr) | ||
|
||
arr = np.array(['1', '2', '3', '4', '5'], dtype=object) | ||
result = tslib.array_to_datetime(arr, errors='ignore') | ||
result = tslib.array_to_datetime(arr, errors='ignore')[0] | ||
tm.assert_numpy_array_equal(result, arr) | ||
|
||
@pytest.mark.parametrize('invalid_date', [ | ||
|
@@ -105,13 +125,13 @@ def test_coerce_outside_ns_bounds(self, invalid_date): | |
with pytest.raises(ValueError): | ||
tslib.array_to_datetime(arr, errors='raise') | ||
|
||
result = tslib.array_to_datetime(arr, errors='coerce') | ||
result = tslib.array_to_datetime(arr, errors='coerce')[0] | ||
expected = np.array([tslib.iNaT], dtype='M8[ns]') | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_coerce_outside_ns_bounds_one_valid(self): | ||
arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) | ||
result = tslib.array_to_datetime(arr, errors='coerce') | ||
result = tslib.array_to_datetime(arr, errors='coerce')[0] | ||
expected = [tslib.iNaT, | ||
'2000-01-01T00:00:00.000000000-0000'] | ||
tm.assert_numpy_array_equal( | ||
|
@@ -123,11 +143,11 @@ def test_coerce_of_invalid_datetimes(self): | |
|
||
# Without coercing, the presence of any invalid dates prevents | ||
# any values from being converted | ||
result = tslib.array_to_datetime(arr, errors='ignore') | ||
result = tslib.array_to_datetime(arr, errors='ignore')[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like writing these like
|
||
tm.assert_numpy_array_equal(result, arr) | ||
|
||
# With coercing, the invalid dates becomes iNaT | ||
result = tslib.array_to_datetime(arr, errors='coerce') | ||
result = tslib.array_to_datetime(arr, errors='coerce')[0] | ||
expected = ['2013-01-01T00:00:00.000000000-0000', | ||
tslib.iNaT, | ||
tslib.iNaT] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This (pre-existing) pattern is pretty opaque to a first-time reader. What if instead of raising
TypeError
the fallback block became its own function that gets called from here?