-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
POC/ENH: infer resolution in array_to_datetime #55741
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
11739a8
7ed4ea7
6ec542f
17865ad
3b88a1e
6e657ae
aa8ec80
2cdc788
8d2b5f3
ea626e4
e5ce66c
a75db4e
c19a840
948ab0e
ac6ce4d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,9 +31,14 @@ import numpy as np | |
|
||
cnp.import_array() | ||
|
||
from pandas._libs.tslibs.dtypes cimport ( | ||
get_supported_reso, | ||
npy_unit_to_abbrev, | ||
) | ||
from pandas._libs.tslibs.np_datetime cimport ( | ||
NPY_DATETIMEUNIT, | ||
NPY_FR_ns, | ||
get_datetime64_unit, | ||
import_pandas_datetime, | ||
npy_datetimestruct, | ||
npy_datetimestruct_to_datetime, | ||
|
@@ -441,6 +446,7 @@ cpdef array_to_datetime( | |
utc : bool, default False | ||
indicator whether the dates should be UTC | ||
creso : NPY_DATETIMEUNIT, default NPY_FR_ns | ||
Set to NPY_FR_GENERIC to infer a resolution. | ||
|
||
Returns | ||
------- | ||
|
@@ -464,14 +470,19 @@ cpdef array_to_datetime( | |
set out_tzoffset_vals = set() | ||
tzinfo tz_out = None | ||
cnp.flatiter it = cnp.PyArray_IterNew(values) | ||
DatetimeParseState state = DatetimeParseState() | ||
str reso_str | ||
NPY_DATETIMEUNIT item_reso | ||
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC | ||
DatetimeParseState state = DatetimeParseState(creso) | ||
str abbrev | ||
|
||
# specify error conditions | ||
assert is_raise or is_ignore or is_coerce | ||
|
||
reso_str = npy_unit_to_abbrev(creso) | ||
result = np.empty((<object>values).shape, dtype=f"M8[{reso_str}]") | ||
if infer_reso: | ||
abbrev = "ns" | ||
else: | ||
abbrev = npy_unit_to_abbrev(creso) | ||
result = np.empty((<object>values).shape, dtype=f"M8[{abbrev}]") | ||
iresult = result.view("i8").ravel() | ||
|
||
for i in range(n): | ||
|
@@ -484,19 +495,38 @@ cpdef array_to_datetime( | |
iresult[i] = NPY_NAT | ||
|
||
elif PyDateTime_Check(val): | ||
if isinstance(val, _Timestamp): | ||
item_reso = val._creso | ||
else: | ||
item_reso = NPY_DATETIMEUNIT.NPY_FR_us | ||
state.update_creso(item_reso) | ||
if infer_reso: | ||
creso = state.creso | ||
tz_out = state.process_datetime(val, tz_out, utc_convert) | ||
iresult[i] = parse_pydatetime(val, &dts, creso=creso) | ||
|
||
elif PyDate_Check(val): | ||
item_reso = NPY_DATETIMEUNIT.NPY_FR_s | ||
state.update_creso(item_reso) | ||
if infer_reso: | ||
creso = state.creso | ||
iresult[i] = pydate_to_dt64(val, &dts, reso=creso) | ||
state.found_other = True | ||
|
||
elif is_datetime64_object(val): | ||
item_reso = get_supported_reso(get_datetime64_unit(val)) | ||
state.update_creso(item_reso) | ||
if infer_reso: | ||
creso = state.creso | ||
iresult[i] = get_datetime64_nanos(val, creso) | ||
state.found_other = True | ||
|
||
elif is_integer_object(val) or is_float_object(val): | ||
# these must be ns unit by definition | ||
item_reso = NPY_FR_ns | ||
state.update_creso(item_reso) | ||
if infer_reso: | ||
creso = state.creso | ||
|
||
if val != val or val == NPY_NAT: | ||
iresult[i] = NPY_NAT | ||
|
@@ -514,11 +544,20 @@ cpdef array_to_datetime( | |
if parse_today_now(val, &iresult[i], utc, creso): | ||
# We can't _quite_ dispatch this to convert_str_to_tsobject | ||
# bc there isn't a nice way to pass "utc" | ||
item_reso = NPY_DATETIMEUNIT.NPY_FR_us | ||
state.update_creso(item_reso) | ||
if infer_reso: | ||
creso = state.creso | ||
continue | ||
|
||
_ts = convert_str_to_tsobject( | ||
val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst | ||
) | ||
item_reso = _ts.creso | ||
state.update_creso(item_reso) | ||
if infer_reso: | ||
creso = state.creso | ||
|
||
_ts.ensure_reso(creso, val) | ||
|
||
iresult[i] = _ts.value | ||
|
@@ -586,6 +625,23 @@ cpdef array_to_datetime( | |
# e.g. test_to_datetime_mixed_awareness_mixed_types | ||
raise ValueError("Cannot mix tz-aware with tz-naive values") | ||
|
||
if infer_reso: | ||
if state.creso_ever_changed: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure what to do here but my first read of There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
we do not There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any reason to keep it as part of the state then instead of just local to the function? To be clear, this is not a hold-up on this PR for me; just a consideration point as this continues to evolve. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because it gets set inside a state-updating method that I don't want to duplicate in a bunch of places. Also, there are a couple of different places where we use DatetimeParseState, and I'm trying to iron out the kinks in behavior differences between them. |
||
# We encountered mismatched resolutions, need to re-parse with | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# the correct one. | ||
return array_to_datetime( | ||
values, | ||
errors=errors, | ||
yearfirst=yearfirst, | ||
dayfirst=dayfirst, | ||
utc=utc, | ||
creso=state.creso, | ||
) | ||
|
||
# Otherwise we can use the single reso that we encountered and avoid | ||
# a second pass. | ||
abbrev = npy_unit_to_abbrev(state.creso) | ||
result = iresult.view(f"M8[{abbrev}]") | ||
return result, tz_out | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,66 @@ | |
creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value | ||
|
||
|
||
class TestArrayToDatetimeResolutionInference: | ||
# TODO: tests that include tzs, ints | ||
|
||
def test_infer_homogeoneous_datetimes(self): | ||
dt = datetime(2023, 10, 27, 18, 3, 5, 678000) | ||
arr = np.array([dt, dt, dt], dtype=object) | ||
result, tz = tslib.array_to_datetime(arr, creso=creso_infer) | ||
assert tz is None | ||
expected = np.array([dt, dt, dt], dtype="M8[us]") | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_infer_homogeoneous_date_objects(self): | ||
dt = datetime(2023, 10, 27, 18, 3, 5, 678000) | ||
dt2 = dt.date() | ||
arr = np.array([None, dt2, dt2, dt2], dtype=object) | ||
result, tz = tslib.array_to_datetime(arr, creso=creso_infer) | ||
assert tz is None | ||
expected = np.array([np.datetime64("NaT"), dt2, dt2, dt2], dtype="M8[s]") | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_infer_homogeoneous_dt64(self): | ||
dt = datetime(2023, 10, 27, 18, 3, 5, 678000) | ||
dt64 = np.datetime64(dt, "ms") | ||
arr = np.array([None, dt64, dt64, dt64], dtype=object) | ||
result, tz = tslib.array_to_datetime(arr, creso=creso_infer) | ||
assert tz is None | ||
expected = np.array([np.datetime64("NaT"), dt64, dt64, dt64], dtype="M8[ms]") | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_infer_homogeoneous_timestamps(self): | ||
dt = datetime(2023, 10, 27, 18, 3, 5, 678000) | ||
ts = Timestamp(dt).as_unit("ns") | ||
arr = np.array([None, ts, ts, ts], dtype=object) | ||
result, tz = tslib.array_to_datetime(arr, creso=creso_infer) | ||
assert tz is None | ||
expected = np.array([np.datetime64("NaT")] + [ts.asm8] * 3, dtype="M8[ns]") | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_infer_homogeoneous_datetimes_strings(self): | ||
item = "2023-10-27 18:03:05.678000" | ||
arr = np.array([None, item, item, item], dtype=object) | ||
result, tz = tslib.array_to_datetime(arr, creso=creso_infer) | ||
assert tz is None | ||
expected = np.array([np.datetime64("NaT"), item, item, item], dtype="M8[us]") | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_infer_heterogeneous(self): | ||
dtstr = "2023-10-27 18:03:05.678000" | ||
|
||
arr = np.array([dtstr, dtstr[:-3], dtstr[:-7], None], dtype=object) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In a follow-up, it would be good to also add a test for different objects, e.g. |
||
result, tz = tslib.array_to_datetime(arr, creso=creso_infer) | ||
assert tz is None | ||
expected = np.array(arr, dtype="M8[us]") | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result, tz = tslib.array_to_datetime(arr[::-1], creso=creso_infer) | ||
assert tz is None | ||
tm.assert_numpy_array_equal(result, expected[::-1]) | ||
|
||
|
||
class TestArrayToDatetimeWithTZResolutionInference: | ||
def test_array_to_datetime_with_tz_resolution(self): | ||
tz = tzoffset("custom", 3600) | ||
|
Uh oh!
There was an error while loading. Please reload this page.