Skip to content

Commit ad93898

Browse files
committed
ENH: infer resolution in to_datetime, DatetimeIndex
1 parent d5ad65c commit ad93898

File tree

130 files changed

+1236
-710
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

130 files changed

+1236
-710
lines changed

pandas/_libs/lib.pyx

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -123,16 +123,12 @@ from pandas._libs.missing cimport (
123123
is_null_datetime64,
124124
is_null_timedelta64,
125125
)
126-
from pandas._libs.tslibs.conversion cimport (
127-
_TSObject,
128-
convert_to_tsobject,
129-
)
126+
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
130127
from pandas._libs.tslibs.nattype cimport (
131128
NPY_NAT,
132129
c_NaT as NaT,
133130
checknull_with_nat,
134131
)
135-
from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns
136132
from pandas._libs.tslibs.offsets cimport is_offset_object
137133
from pandas._libs.tslibs.period cimport is_period_object
138134
from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
@@ -2523,7 +2519,6 @@ def maybe_convert_objects(ndarray[object] objects,
25232519
ndarray[uint8_t] mask
25242520
Seen seen = Seen()
25252521
object val
2526-
_TSObject tsobj
25272522
float64_t fnan = NaN
25282523

25292524
if dtype_if_all_nat is not None:
@@ -2630,9 +2625,9 @@ def maybe_convert_objects(ndarray[object] objects,
26302625
else:
26312626
seen.datetime_ = True
26322627
try:
2633-
tsobj = convert_to_tsobject(val, None, None, 0, 0)
2634-
tsobj.ensure_reso(NPY_FR_ns)
2628+
convert_to_tsobject(val, None, None, 0, 0)
26352629
except OutOfBoundsDatetime:
2630+
# e.g. test_out_of_s_bounds_datetime64
26362631
seen.object_ = True
26372632
break
26382633
else:

pandas/_libs/tslib.pyx

Lines changed: 28 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,10 @@ from pandas._libs.tslibs.conversion cimport (
7070
get_datetime64_nanos,
7171
parse_pydatetime,
7272
)
73-
from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev
73+
from pandas._libs.tslibs.dtypes cimport (
74+
get_supported_reso,
75+
npy_unit_to_abbrev,
76+
)
7477
from pandas._libs.tslibs.nattype cimport (
7578
NPY_NAT,
7679
c_NaT as NaT,
@@ -417,7 +420,7 @@ cpdef array_to_datetime(
417420
bint dayfirst=False,
418421
bint yearfirst=False,
419422
bint utc=False,
420-
NPY_DATETIMEUNIT creso=NPY_FR_ns,
423+
NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC,
421424
):
422425
"""
423426
Converts a 1D array of date-like values to a numpy array of either:
@@ -444,18 +447,20 @@ cpdef array_to_datetime(
444447
yearfirst parsing behavior when encountering datetime strings
445448
utc : bool, default False
446449
indicator whether the dates should be UTC
447-
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
448-
Set to NPY_FR_GENERIC to infer a resolution.
450+
creso : NPY_DATETIMEUNIT, default NPY_FR_GENERIC
451+
If NPY_FR_GENERIC, conduct inference.
449452
450453
Returns
451454
-------
452455
np.ndarray
453456
May be datetime64[ns] or object dtype
454457
tzinfo or None
458+
str
459+
Inferred resolution
455460
"""
456461
cdef:
457462
Py_ssize_t i, n = values.size
458-
object val, tz
463+
object val
459464
ndarray[int64_t] iresult
460465
npy_datetimestruct dts
461466
bint utc_convert = bool(utc)
@@ -467,7 +472,7 @@ cpdef array_to_datetime(
467472
_TSObject _ts
468473
float tz_offset
469474
set out_tzoffset_vals = set()
470-
tzinfo tz_out = None
475+
tzinfo tz, tz_out = None
471476
cnp.flatiter it = cnp.PyArray_IterNew(values)
472477
NPY_DATETIMEUNIT item_reso
473478
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
@@ -522,15 +527,14 @@ cpdef array_to_datetime(
522527

523528
elif is_integer_object(val) or is_float_object(val):
524529
# these must be ns unit by-definition
525-
item_reso = NPY_FR_ns
526-
state.update_creso(item_reso)
527-
if infer_reso:
528-
creso = state.creso
529530

530531
if val != val or val == NPY_NAT:
531532
iresult[i] = NPY_NAT
532533
else:
533-
# we now need to parse this as if unit='ns'
534+
item_reso = NPY_FR_ns
535+
state.update_creso(item_reso)
536+
if infer_reso:
537+
creso = state.creso
534538
iresult[i] = cast_from_unit(val, "ns", out_reso=creso)
535539
state.found_other = True
536540

@@ -552,6 +556,14 @@ cpdef array_to_datetime(
552556
_ts = convert_str_to_tsobject(
553557
val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
554558
)
559+
560+
if _ts.value == NPY_NAT:
561+
# e.g. "NaT" string or empty string, we do not consider
562+
# this as either tzaware or tznaive. See
563+
# test_to_datetime_with_empty_str_utc_false_format_mixed
564+
iresult[i] = _ts.value
565+
continue
566+
555567
item_reso = _ts.creso
556568
state.update_creso(item_reso)
557569
if infer_reso:
@@ -561,12 +573,7 @@ cpdef array_to_datetime(
561573
iresult[i] = _ts.value
562574

563575
tz = _ts.tzinfo
564-
if _ts.value == NPY_NAT:
565-
# e.g. "NaT" string or empty string, we do not consider
566-
# this as either tzaware or tznaive. See
567-
# test_to_datetime_with_empty_str_utc_false_format_mixed
568-
pass
569-
elif tz is not None:
576+
if tz is not None:
570577
# dateutil timezone objects cannot be hashed, so
571578
# store the UTC offsets in seconds instead
572579
nsecs = tz.utcoffset(None).total_seconds()
@@ -639,7 +646,7 @@ cpdef array_to_datetime(
639646
# Otherwise we can use the single reso that we encountered and avoid
640647
# a second pass.
641648
abbrev = npy_unit_to_abbrev(state.creso)
642-
result = iresult.view(f"M8[{abbrev}]")
649+
result = iresult.view(f"M8[{abbrev}]").reshape(result.shape)
643650
return result, tz_out
644651

645652

@@ -803,7 +810,9 @@ def array_to_datetime_with_tz(
803810
if state.creso_ever_changed:
804811
# We encountered mismatched resolutions, need to re-parse with
805812
# the correct one.
806-
return array_to_datetime_with_tz(values, tz=tz, creso=creso)
813+
return array_to_datetime_with_tz(
814+
values, tz=tz, dayfirst=dayfirst, yearfirst=yearfirst, creso=creso
815+
)
807816

808817
# Otherwise we can use the single reso that we encountered and avoid
809818
# a second pass.

pandas/_libs/tslibs/conversion.pyx

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -710,7 +710,9 @@ cdef int64_t parse_pydatetime(
710710
result = _ts.value
711711
else:
712712
if isinstance(val, _Timestamp):
713-
result = (<_Timestamp>val)._as_creso(creso, round_ok=False)._value
713+
# TODO: make round_ok an arg here that is True if a user passed a
714+
# dtype to DatetimeIndex/astype and False if we are doing inference?
715+
result = (<_Timestamp>val)._as_creso(creso, round_ok=True)._value
714716
else:
715717
result = pydatetime_to_dt64(val, dts, reso=creso)
716718
check_dts_bounds(dts, creso)

pandas/_libs/tslibs/strptime.pyx

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ cdef _get_format_regex(str fmt):
241241

242242

243243
cdef class DatetimeParseState:
244-
def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns):
244+
def __cinit__(self, NPY_DATETIMEUNIT creso):
245245
# found_tz and found_naive are specifically about datetime/Timestamp
246246
# objects with and without tzinfos attached.
247247
self.found_tz = False
@@ -295,7 +295,7 @@ def array_strptime(
295295
bint exact=True,
296296
errors="raise",
297297
bint utc=False,
298-
NPY_DATETIMEUNIT creso=NPY_FR_ns,
298+
NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC,
299299
):
300300
"""
301301
Calculates the datetime structs represented by the passed array of strings
@@ -621,7 +621,7 @@ cdef tzinfo _parse_with_format(
621621
elif len(s) <= 6:
622622
item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us
623623
else:
624-
item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns
624+
item_reso[0] = NPY_FR_ns
625625
# Pad to always return nanoseconds
626626
s += "0" * (9 - len(s))
627627
us = long(s)

pandas/_testing/asserters.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -970,6 +970,9 @@ def assert_series_equal(
970970

971971
# datetimelike may have different objects (e.g. datetime.datetime
972972
# vs Timestamp) but will compare equal
973+
# TODO: this works for object-vs-dt64 but not e.g. dt64[ns] vs dt64[us],
974+
# which AFAICT would have been intended at the time
975+
# check_datetimelike_compat was implemented, xref GH#55638
973976
if not Index(left._values).equals(Index(right._values)):
974977
msg = (
975978
f"[datetimelike_compat=True] {left._values} "

pandas/core/arrays/datetimes.py

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2229,9 +2229,11 @@ def _sequence_to_dt64(
22292229
data, copy = maybe_convert_dtype(data, copy, tz=tz)
22302230
data_dtype = getattr(data, "dtype", None)
22312231

2232-
if out_unit is None:
2233-
out_unit = "ns"
2234-
out_dtype = np.dtype(f"M8[{out_unit}]")
2232+
out_dtype = DT64NS_DTYPE
2233+
out_reso = abbrev_to_npy_unit(None) # NPY_FR_GENERIC
2234+
if out_unit is not None:
2235+
out_dtype = np.dtype(f"M8[{out_unit}]")
2236+
out_reso = abbrev_to_npy_unit(out_unit)
22352237

22362238
if data_dtype == object or is_string_dtype(data_dtype):
22372239
# TODO: We do not have tests specific to string-dtypes,
@@ -2255,19 +2257,17 @@ def _sequence_to_dt64(
22552257
dayfirst=dayfirst,
22562258
yearfirst=yearfirst,
22572259
allow_object=False,
2258-
out_unit=out_unit or "ns",
2260+
out_reso=out_reso,
22592261
)
22602262
copy = False
22612263
if tz and inferred_tz:
22622264
# two timezones: convert to intended from base UTC repr
22632265
# GH#42505 by convention, these are _already_ UTC
2264-
assert converted.dtype == out_dtype, converted.dtype
2265-
result = converted.view(out_dtype)
2266+
result = converted
22662267

22672268
elif inferred_tz:
22682269
tz = inferred_tz
2269-
assert converted.dtype == out_dtype, converted.dtype
2270-
result = converted.view(out_dtype)
2270+
result = converted
22712271

22722272
else:
22732273
result, _ = _construct_from_dt64_naive(
@@ -2365,7 +2365,7 @@ def objects_to_datetime64(
23652365
utc: bool = False,
23662366
errors: DateTimeErrorChoices = "raise",
23672367
allow_object: bool = False,
2368-
out_unit: str = "ns",
2368+
out_reso: int = 14,
23692369
):
23702370
"""
23712371
Convert data to array of timestamps.
@@ -2381,7 +2381,9 @@ def objects_to_datetime64(
23812381
allow_object : bool
23822382
Whether to return an object-dtype ndarray instead of raising if the
23832383
data contains more than one timezone.
2384-
out_unit : str, default "ns"
2384+
out_reso : int, default 14
2385+
14 corresponds to NPY_FR_GENERIC, which indicates to infer
2386+
a resolution.
23852387
23862388
Returns
23872389
-------
@@ -2407,7 +2409,7 @@ def objects_to_datetime64(
24072409
utc=utc,
24082410
dayfirst=dayfirst,
24092411
yearfirst=yearfirst,
2410-
creso=abbrev_to_npy_unit(out_unit),
2412+
creso=out_reso,
24112413
)
24122414

24132415
if tz_parsed is not None:

pandas/core/series.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3483,6 +3483,7 @@ def combine_first(self, other) -> Series:
34833483
other = other.reindex(keep_other, copy=False)
34843484

34853485
if this.dtype.kind == "M" and other.dtype.kind != "M":
3486+
# TODO: try to match resos?
34863487
other = to_datetime(other)
34873488
combined = concat([this, other])
34883489
combined = combined.reindex(new_index, copy=False)

pandas/core/tools/datetimes.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -497,7 +497,8 @@ def _convert_listlike_datetimes(
497497
if tz_parsed is not None:
498498
# We can take a shortcut since the datetime64 numpy array
499499
# is in UTC
500-
dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed))
500+
out_unit = np.datetime_data(result.dtype)[0]
501+
dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit))
501502
dt64_values = result.view(f"M8[{dtype.unit}]")
502503
dta = DatetimeArray._simple_new(dt64_values, dtype=dtype)
503504
return DatetimeIndex._simple_new(dta, name=name)

pandas/core/window/ewm.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
import numpy as np
99

10+
from pandas._libs import lib
1011
from pandas._libs.tslibs import Timedelta
1112
import pandas._libs.window.aggregations as window_aggregations
1213
from pandas.util._decorators import doc
@@ -364,7 +365,15 @@ def __init__(
364365
if not self.adjust:
365366
raise NotImplementedError("times is not supported with adjust=False.")
366367
if not is_datetime64_ns_dtype(self.times):
367-
raise ValueError("times must be datetime64[ns] dtype.")
368+
if hasattr(self.times, "dtype") and lib.is_np_dtype(
369+
self.times.dtype, "M"
370+
):
371+
# TODO: is this what we really want?
372+
from pandas import Series
373+
374+
self.times = Series(self.times).dt.as_unit("ns")._values
375+
else:
376+
raise ValueError("times must be datetime64[ns] dtype.")
368377
if len(self.times) != len(obj):
369378
raise ValueError("times must be the same length as the object.")
370379
if not isinstance(self.halflife, (str, datetime.timedelta, np.timedelta64)):

pandas/tests/apply/test_frame_apply.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1272,7 +1272,7 @@ def test_nuiscance_columns():
12721272

12731273
result = df.agg(["min"])
12741274
expected = DataFrame(
1275-
[[1, 1.0, "bar", Timestamp("20130101")]],
1275+
[[1, 1.0, "bar", Timestamp("20130101").as_unit("ns")]],
12761276
index=["min"],
12771277
columns=df.columns,
12781278
)

pandas/tests/apply/test_series_apply.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -153,7 +153,7 @@ def func(x):
153153
def test_apply_box():
154154
# ufunc will not be boxed. Same test cases as the test_map_box
155155
vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
156-
s = Series(vals)
156+
s = Series(vals, dtype="M8[ns]")
157157
assert s.dtype == "datetime64[ns]"
158158
# boxed value must be Timestamp instance
159159
res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat")
@@ -164,7 +164,7 @@ def test_apply_box():
164164
pd.Timestamp("2011-01-01", tz="US/Eastern"),
165165
pd.Timestamp("2011-01-02", tz="US/Eastern"),
166166
]
167-
s = Series(vals)
167+
s = Series(vals, dtype="M8[ns, US/Eastern]")
168168
assert s.dtype == "datetime64[ns, US/Eastern]"
169169
res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat")
170170
exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])

pandas/tests/arithmetic/test_datetime64.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1286,7 +1286,7 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array):
12861286
["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"],
12871287
freq="h",
12881288
tz=tz,
1289-
)
1289+
).as_unit("ns")
12901290

12911291
dates = tm.box_expected(dates, box_with_array)
12921292
expected = tm.box_expected(expected, box_with_array)
@@ -1580,7 +1580,7 @@ def test_dti_add_sub_nonzero_mth_offset(
15801580
mth = getattr(date, op)
15811581
result = mth(offset)
15821582

1583-
expected = DatetimeIndex(exp, tz=tz)
1583+
expected = DatetimeIndex(exp, tz=tz).as_unit("ns")
15841584
expected = tm.box_expected(expected, box_with_array, False)
15851585
tm.assert_equal(result, expected)
15861586

@@ -2286,7 +2286,7 @@ def test_dti_add_series(self, tz_naive_fixture, names):
22862286
tz = tz_naive_fixture
22872287
index = DatetimeIndex(
22882288
["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0]
2289-
)
2289+
).as_unit("ns")
22902290
ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1])
22912291
expected = Series(index + Timedelta(seconds=5), index=index, name=names[2])
22922292

0 commit comments

Comments
 (0)