From 510951bc289c91523cd67dffd65273d6610fa067 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 21 Jan 2025 10:38:07 +0800 Subject: [PATCH 1/8] ENH: Add test for to_datetime with scalar out of bounds value --- pandas/tests/tools/test_to_datetime.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 74b051aec71a4..119fcb4c07ec7 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1,6 +1,7 @@ """test to_datetime""" import calendar +import locale from collections import deque from datetime import ( date, @@ -9,12 +10,12 @@ timezone, ) from decimal import Decimal -import locale import zoneinfo -from dateutil.parser import parse import numpy as np +import pandas as pd import pytest +from dateutil.parser import parse from pandas._libs import tslib from pandas._libs.tslibs import ( @@ -29,8 +30,6 @@ import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_datetime64_ns_dtype - -import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3689,3 +3688,22 @@ def test_to_datetime_wrapped_datetime64_ps(): ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None ) tm.assert_index_equal(result, expected) + + +def test_to_datetime_scalar_out_of_bounds(): + """Ensure pd.to_datetime raises an error for out-of-bounds scalar values.""" + uint64_max = np.iinfo("uint64").max + + # Expect an OverflowError when passing uint64_max as a scalar + with pytest.raises(OutOfBoundsDatetime): + pd.to_datetime(uint64_max, unit="ns") + + # Expect the same behavior when passing it as a list + with pytest.raises(OutOfBoundsDatetime): + pd.to_datetime([uint64_max], unit="ns") + + # Test a valid value (should not raise an error) + valid_timestamp = 1_700_000_000_000_000_000 # A reasonable nanosecond timestamp + result = pd.to_datetime(valid_timestamp, unit="ns") + assert 
isinstance(result, pd.Timestamp) + \ No newline at end of file From 16d14ab4b647e8f97317cc6c1d4d15d376032e18 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 21 Jan 2025 10:52:41 +0800 Subject: [PATCH 2/8] ENH: Fix out-of-bounds error handling in _to_datetime_with_unit --- pandas/core/tools/datetimes.py | 39 +++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 30487de7bafd5..20594c4eaee48 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -75,7 +75,6 @@ from pandas.core.construction import extract_array from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import DatetimeIndex - if TYPE_CHECKING: from collections.abc import ( Callable, @@ -479,41 +478,60 @@ def _array_strptime_with_fallback( return Index(result, dtype=result.dtype, name=name) -def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: + + + +def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeIndex: """ - to_datetime specalized to the case where a 'unit' is passed. + to_datetime specialized to the case where a 'unit' is passed. + Fixes a bug where scalar out-of-bounds values were not raising + an error consistently. """ + # Ensure we handle both array-likes and scalars the same way. + # extract_array can return a scalar if 'arg' is scalar-like; + # so we force everything into at least 1D shape. 
arg = extract_array(arg, extract_numpy=True) + arg = np.atleast_1d(arg) # GH#30050 pass an ndarray to tslib.array_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): + # For IntegerArray, we can directly convert arr = arg.astype(f"datetime64[{unit}]") tz_parsed = None + else: + # Now we have a guaranteed ndarray arg = np.asarray(arg) if arg.dtype.kind in "iu": # Note we can't do "f" here because that could induce unwanted - # rounding GH#14156, GH#20445 + # rounding GH#14156, GH#20445 arr = arg.astype(f"datetime64[{unit}]", copy=False) try: arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False) except OutOfBoundsDatetime: if errors == "raise": raise + # errors != "raise" => coerce to object and retry arg = arg.astype(object) return _to_datetime_with_unit(arg, unit, name, utc, errors) tz_parsed = None elif arg.dtype.kind == "f": + # Floating dtypes with np.errstate(over="raise"): try: arr = cast_from_unit_vectorized(arg, unit=unit) except OutOfBoundsDatetime as err: if errors != "raise": + # coerce to object and retry return _to_datetime_with_unit( - arg.astype(object), unit, name, utc, errors + arg.astype(object), + unit, + name, + utc, + errors, ) raise OutOfBoundsDatetime( f"cannot convert input with unit '{unit}'" @@ -521,7 +539,9 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: arr = arr.view("M8[ns]") tz_parsed = None + else: + # Fallback: treat as object dtype arg = arg.astype(object, copy=False) arr, tz_parsed = tslib.array_to_datetime( arg, @@ -531,13 +551,10 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: creso=NpyDatetimeUnit.NPY_FR_ns.value, ) + # Construct a DatetimeIndex from the array result = DatetimeIndex(arr, name=name) - if not isinstance(result, DatetimeIndex): - return result - # GH#23758: We may still need to localize the result with tz - # GH#25546: Apply tz_parsed first (from arg), then tz (from caller) - # result will be naive but in 
UTC + # May need to localize result to parsed tz or convert to UTC if requested result = result.tz_localize("UTC").tz_convert(tz_parsed) if utc: @@ -545,9 +562,11 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: result = result.tz_localize("utc") else: result = result.tz_convert("utc") + return result + def _adjust_to_origin(arg, origin, unit): """ Helper function for to_datetime. From 4a2ad594ed767a134ff921027b353d0296dc2f7e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 21 Jan 2025 11:37:03 +0800 Subject: [PATCH 3/8] ENH: Add debugging breakpoint in _to_datetime_with_unit for troubleshooting --- pandas/core/tools/datetimes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 20594c4eaee48..e3a81e295a998 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -487,6 +487,8 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeI Fixes a bug where scalar out-of-bounds values were not raising an error consistently. """ + import pdb; pdb.set_trace() + # Ensure we handle both array-likes and scalars the same way. # extract_array can return a scalar if 'arg' is scalar-like; # so we force everything into at least 1D shape. 
From cb710ee42327bef0b4dd2641ba511df0e33e2a04 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 21 Jan 2025 11:55:00 +0800 Subject: [PATCH 4/8] ENH: Improve error handling for out-of-bounds uint64 values in _to_datetime_with_unit --- pandas/core/tools/datetimes.py | 62 ++++++++++++++------------ pandas/tests/tools/test_to_datetime.py | 18 ++++---- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e3a81e295a998..eebd4732ba958 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -75,6 +75,7 @@ from pandas.core.construction import extract_array from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import DatetimeIndex + if TYPE_CHECKING: from collections.abc import ( Callable, @@ -478,62 +479,70 @@ def _array_strptime_with_fallback( return Index(result, dtype=result.dtype, name=name) - - - def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeIndex: """ to_datetime specialized to the case where a 'unit' is passed. - Fixes a bug where scalar out-of-bounds values were not raising - an error consistently. """ - import pdb; pdb.set_trace() - - # Ensure we handle both array-likes and scalars the same way. - # extract_array can return a scalar if 'arg' is scalar-like; - # so we force everything into at least 1D shape. arg = extract_array(arg, extract_numpy=True) + # Fix GH#60677 + # Ensure scalar and array-like both become arrays + # (so both paths use the same code). 
arg = np.atleast_1d(arg) # GH#30050 pass an ndarray to tslib.array_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): - # For IntegerArray, we can directly convert arr = arg.astype(f"datetime64[{unit}]") tz_parsed = None - else: - # Now we have a guaranteed ndarray arg = np.asarray(arg) if arg.dtype.kind in "iu": # Note we can't do "f" here because that could induce unwanted - # rounding GH#14156, GH#20445 + # rounding GH#14156, GH#20445 + # Fix GH#60677 + # ------------------------------------------------ + # A) **Check for uint64 values above int64 max** + # so we don't accidentally wrap around to -1, etc. + # ------------------------------------------------ + if arg.dtype.kind == "u": # unsigned + above_max = arg > np.iinfo(np.int64).max + if above_max.any(): + if errors == "raise": + raise OutOfBoundsDatetime( + "Cannot convert uint64 values above" + f"{np.iinfo(np.int64).max}" + "to a 64-bit signed datetime64[ns]." + ) + else: + # For errors != "raise" (e.g. 
"coerce" or "ignore"), + # we can replace out-of-range entries with NaN (-> NaT), + # then switch to the fallback object path: + arg = arg.astype(object) + arg[above_max] = np.nan + return _to_datetime_with_unit(arg, unit, name, utc, errors) + + # ------------------------------------------------ + # B) Proceed with normal numeric -> datetime logic + # ------------------------------------------------ arr = arg.astype(f"datetime64[{unit}]", copy=False) try: arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False) except OutOfBoundsDatetime: if errors == "raise": raise - # errors != "raise" => coerce to object and retry arg = arg.astype(object) return _to_datetime_with_unit(arg, unit, name, utc, errors) tz_parsed = None elif arg.dtype.kind == "f": - # Floating dtypes with np.errstate(over="raise"): try: arr = cast_from_unit_vectorized(arg, unit=unit) except OutOfBoundsDatetime as err: if errors != "raise": - # coerce to object and retry return _to_datetime_with_unit( - arg.astype(object), - unit, - name, - utc, - errors, + arg.astype(object), unit, name, utc, errors ) raise OutOfBoundsDatetime( f"cannot convert input with unit '{unit}'" @@ -541,9 +550,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeI arr = arr.view("M8[ns]") tz_parsed = None - else: - # Fallback: treat as object dtype arg = arg.astype(object, copy=False) arr, tz_parsed = tslib.array_to_datetime( arg, @@ -553,10 +560,11 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeI creso=NpyDatetimeUnit.NPY_FR_ns.value, ) - # Construct a DatetimeIndex from the array result = DatetimeIndex(arr, name=name) - # May need to localize result to parsed tz or convert to UTC if requested + # GH#23758: We may still need to localize the result with tz + # GH#25546: Apply tz_parsed first (from arg), then tz (from caller) + # result will be naive but in UTC result = result.tz_localize("UTC").tz_convert(tz_parsed) if utc: @@ -564,11 +572,9 @@ def 
_to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeI result = result.tz_localize("utc") else: result = result.tz_convert("utc") - return result - def _adjust_to_origin(arg, origin, unit): """ Helper function for to_datetime. diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 119fcb4c07ec7..4740a9d719c56 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1,7 +1,6 @@ """test to_datetime""" import calendar -import locale from collections import deque from datetime import ( date, @@ -10,12 +9,12 @@ timezone, ) from decimal import Decimal +import locale import zoneinfo +from dateutil.parser import parse import numpy as np -import pandas as pd import pytest -from dateutil.parser import parse from pandas._libs import tslib from pandas._libs.tslibs import ( @@ -30,6 +29,8 @@ import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_datetime64_ns_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -3688,7 +3689,7 @@ def test_to_datetime_wrapped_datetime64_ps(): ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None ) tm.assert_index_equal(result, expected) - + def test_to_datetime_scalar_out_of_bounds(): """Ensure pd.to_datetime raises an error for out-of-bounds scalar values.""" @@ -3696,14 +3697,13 @@ def test_to_datetime_scalar_out_of_bounds(): # Expect an OverflowError when passing uint64_max as a scalar with pytest.raises(OutOfBoundsDatetime): - pd.to_datetime(uint64_max, unit="ns") + to_datetime(uint64_max, unit="ns") # Expect the same behavior when passing it as a list with pytest.raises(OutOfBoundsDatetime): - pd.to_datetime([uint64_max], unit="ns") + to_datetime([uint64_max], unit="ns") # Test a valid value (should not raise an error) valid_timestamp = 1_700_000_000_000_000_000 # A reasonable nanosecond timestamp - result = pd.to_datetime(valid_timestamp, unit="ns") - assert 
isinstance(result, pd.Timestamp) - \ No newline at end of file + result = to_datetime(valid_timestamp, unit="ns") + assert isinstance(result, Timestamp) From b9b2d7ae19c6e6cb733cd2f3b11b9668ff1e64a8 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 21 Jan 2025 16:35:31 +0800 Subject: [PATCH 5/8] Add test for lower bound --- pandas/core/tools/datetimes.py | 5 +++++ pandas/tests/tools/test_to_datetime.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index eebd4732ba958..b14b90127eb24 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -482,6 +482,11 @@ def _array_strptime_with_fallback( def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeIndex: """ to_datetime specialized to the case where a 'unit' is passed. + + Note: This function currently treats values at the upper bound differently + from values at the lower bound. + For upper bound, it raises OutOfBoundsDatetime. + For lower bound, it returns NaT. 
""" arg = extract_array(arg, extract_numpy=True) # Fix GH#60677 diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4740a9d719c56..7901ac955b711 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3694,6 +3694,7 @@ def test_to_datetime_wrapped_datetime64_ps(): def test_to_datetime_scalar_out_of_bounds(): """Ensure pd.to_datetime raises an error for out-of-bounds scalar values.""" uint64_max = np.iinfo("uint64").max + int64_min = np.iinfo("int64").min # Expect an OverflowError when passing uint64_max as a scalar with pytest.raises(OutOfBoundsDatetime): @@ -3703,6 +3704,14 @@ def test_to_datetime_scalar_out_of_bounds(): with pytest.raises(OutOfBoundsDatetime): to_datetime([uint64_max], unit="ns") + # Expect NAT when passing int64_min as a scalar + value = to_datetime(int64_min, unit="ns") + assert value is NaT + + # Expect the same behavior when passing it as a list + value = to_datetime([int64_min], unit="ns") + assert value[0] is NaT + # Test a valid value (should not raise an error) valid_timestamp = 1_700_000_000_000_000_000 # A reasonable nanosecond timestamp result = to_datetime(valid_timestamp, unit="ns") From 4924a6d5f49258eae47fbb01007f22761c094c41 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 22 Jan 2025 10:13:39 +0800 Subject: [PATCH 6/8] ENH: Change return type of _to_datetime_with_unit to Index --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b14b90127eb24..71e78826ba09c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -479,7 +479,7 @@ def _array_strptime_with_fallback( return Index(result, dtype=result.dtype, name=name) -def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> DatetimeIndex: +def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ 
to_datetime specialized to the case where a 'unit' is passed. From 14562cefc7d1a76460539326b48ab1757bb513ad Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 31 Jan 2025 18:39:35 +0800 Subject: [PATCH 7/8] ENH: Add type check for DatetimeIndex in _to_datetime_with_unit --- pandas/core/tools/datetimes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 71e78826ba09c..e9d5baf0f3a2b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -566,6 +566,8 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: ) result = DatetimeIndex(arr, name=name) + if not isinstance(result, DatetimeIndex): + return result # GH#23758: We may still need to localize the result with tz # GH#25546: Apply tz_parsed first (from arg), then tz (from caller) From a4ffcd30edec0aa57c57e864ad2005198dbf398c Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 7 Feb 2025 08:55:15 +0800 Subject: [PATCH 8/8] Added entry in v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a7f63d75a047e..35594b23710e5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -640,6 +640,7 @@ Datetimelike - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) +- Bug in :meth:`to_datetime` not raising ``OutOfBoundsDatetime`` for out-of-bounds scalar inputs when ``unit`` is passed (:issue:`60744`) Timedelta ^^^^^^^^^