diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5b57baa9ec39a..b9b955acb757f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -661,6 +661,8 @@ Datetimelike - Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`) - Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`) - Bug in :func:`to_datetime` was raising ``ValueError`` when parsing :class:`Timestamp`, ``datetime``, or ``np.datetime64`` objects with non-ISO8601 ``format`` (:issue:`49298`, :issue:`50036`) +- Bug in :func:`to_datetime` with ``exact`` and ``format='%Y%m%d'`` wasn't raising if the input didn't match the format (:issue:`50051`) +- Bug in :func:`to_datetime` with ``errors='ignore'`` and ``format='%Y%m%d'`` was returning out-of-bounds inputs as ``datetime.datetime`` objects instead of returning the input (:issue:`50054`) - Timedelta diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index db1388672b37c..d4287622ab0ab 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -27,11 +27,6 @@ def try_parse_dates( dayfirst: bool = ..., default: datetime | None = ..., ) -> npt.NDArray[np.object_]: ... -def try_parse_year_month_day( - years: npt.NDArray[np.object_], # object[:] - months: npt.NDArray[np.object_], # object[:] - days: npt.NDArray[np.object_], # object[:] -) -> npt.NDArray[np.object_]: ... def try_parse_datetime_components( years: npt.NDArray[np.object_], # object[:] months: npt.NDArray[np.object_], # object[:] diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 25a2722c48bd6..cdccccf1f0a5d 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -744,25 +744,6 @@ def try_parse_dates( return result.base # .base to access underlying ndarray -def try_parse_year_month_day( - object[:] years, object[:] months, object[:] days -) -> np.ndarray: - cdef: - Py_ssize_t i, n - object[::1] result - - n = len(years) - # TODO(cython3): Use len instead of `shape[0]` - if months.shape[0] != n or days.shape[0] != n: - raise ValueError("Length of years/months/days must all be equal") - result = np.empty(n, dtype="O") - - for i in range(n): - result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) - - return result.base # .base to access underlying ndarray - - def try_parse_datetime_components(object[:] years, object[:] months, object[:] days, diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index fd0604903000e..3d0eaa4f993bd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -24,8 +24,6 @@ Timedelta, Timestamp, iNaT, - nat_strings, - parsing, timezones as libtimezones, ) from pandas._libs.tslibs.parsing import ( @@ -38,7 +36,6 @@ AnyArrayLike, ArrayLike, DateTimeErrorChoices, - npt, ) from pandas.core.dtypes.common import ( @@ -57,13 +54,11 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.missing import notna from pandas.arrays import ( DatetimeArray, IntegerArray, ) -from pandas.core import algorithms from pandas.core.algorithms import unique from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.datetimes import ( @@ -407,7 +402,6 @@ def _convert_listlike_datetimes( # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation - orig_arg = arg try: arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) except TypeError: @@ -435,8 +429,8 @@ def _convert_listlike_datetimes( require_iso8601 = not infer_datetime_format if format is not None and not require_iso8601: - res = _to_datetime_with_format( - arg, orig_arg, name, utc, format, exact, errors, infer_datetime_format + res = _array_strptime_with_fallback( + arg, name, utc, format, exact, errors, infer_datetime_format ) if res is not None: return res @@ -510,43 +504,6 @@ def _array_strptime_with_fallback( return _box_as_indexlike(result, utc=utc, name=name) -def _to_datetime_with_format( - arg, - orig_arg, - name, - utc: bool, - fmt: str, - exact: bool, - errors: str, - infer_datetime_format: bool, -) -> Index | None: - """ - Try parsing with the given format, returning None on failure. - """ - result = None - - # shortcut formatting here - if fmt == "%Y%m%d": - # pass orig_arg as float-dtype may have been converted to - # datetime64[ns] - orig_arg = ensure_object(orig_arg) - try: - # may return None without raising - result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, OutOfBoundsDatetime) as err: - raise ValueError( - "cannot convert the input to '%Y%m%d' date format" - ) from err - if result is not None: - return _box_as_indexlike(result, utc=utc, name=name) - - # fallback - res = _array_strptime_with_fallback( - arg, name, utc, fmt, exact, errors, infer_datetime_format - ) - return res - - def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ to_datetime specalized to the case where a 'unit' is passed. @@ -973,7 +930,7 @@ def to_datetime( in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') - datetime.datetime(1300, 1, 1, 0, 0) + '13000101' >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT @@ -1241,60 +1198,6 @@ def coerce(values): return values -def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None: - """ - try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, - arg is a passed in as an object dtype, but could really be ints/strings - with nan-like/or floats (e.g. with nan) - - Parameters - ---------- - arg : np.ndarray[object] - errors : {'raise','ignore','coerce'} - """ - - def calc(carg): - # calculate the actual result - carg = carg.astype(object, copy=False) - parsed = parsing.try_parse_year_month_day( - carg / 10000, carg / 100 % 100, carg % 100 - ) - return tslib.array_to_datetime(parsed, errors=errors)[0] - - def calc_with_mask(carg, mask): - result = np.empty(carg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult[~mask] = iNaT - - masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) - result[mask] = masked_result.astype("M8[ns]") - return result - - # try intlike / strings that are ints - try: - return calc(arg.astype(np.int64)) - except (ValueError, OverflowError, TypeError): - pass - - # a float with actual np.nan - try: - carg = arg.astype(np.float64) - return calc_with_mask(carg, notna(carg)) - except (ValueError, OverflowError, TypeError): - pass - - # string with NaN-like - try: - # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected - # "Union[Union[ExtensionArray, ndarray], Index, Series]" - mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type] - return calc_with_mask(arg, mask) - except (ValueError, OverflowError, TypeError): - pass - - return None - - __all__ = [ "DateParseError", "should_cache", diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 0e9c8a3ce487b..5046b7adc8aef 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -117,6 +117,7 @@ def test_to_datetime_format_YYYYMMDD(self, cache): tm.assert_series_equal(result, expected) def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): + # GH50051 ser = Series([19801222, 19801222] + [19810105] * 5) # with NaT expected = Series( @@ -125,24 +126,21 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): expected[2] = np.nan ser[2] = np.nan - result = to_datetime(ser, format="%Y%m%d", cache=cache) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=None): + to_datetime(ser, format="%Y%m%d", cache=cache) # string with NaT ser2 = ser.apply(str) ser2[2] = "nat" - result = to_datetime(ser2, format="%Y%m%d", cache=cache) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match=None): + to_datetime(ser2, format="%Y%m%d", cache=cache) def test_to_datetime_format_YYYYMMDD_ignore(self, cache): # coercion - # GH 7930 + # GH 7930, GH50054 ser = Series([20121231, 20141231, 99991231]) + expected = Series([20121231, 20141231, 99991231], dtype=object) result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) - expected = Series( - [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], - dtype=object, - ) tm.assert_series_equal(result, expected) def test_to_datetime_format_YYYYMMDD_coercion(self, cache):