diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 099e5bc48353a..e4fc0d0471727 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -907,6 +907,7 @@ Other
 - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
 - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
 - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
+- Bug in input validation when coercing object-dtype arrays containing ambiguous datetime strings to ``datetime64`` that could result in silently inconsistent parsing (:issue:`61353`)
 
 .. ***DO NOT USE THIS SECTION***
 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 6aa5062b8ed86..a345faa6ec68b 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -2177,6 +2177,42 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
     if isinstance(values, np.ndarray):
         values = ensure_wrapped_if_datetimelike(values)
 
+        # Only validate non-empty 1-D object arrays made up entirely of strings
+        # that start with a numeric "/"- or "-"-separated date ("12/01/2020").
+        _date_like_re = re.compile(r"\d{1,4}[/\-]\d{1,2}[/\-]\d{1,4}")
+        if (
+            values.dtype == object
+            and values.ndim == 1
+            and len(values) > 0
+            and all(isinstance(x, str) and _date_like_re.match(x) for x in values)
+        ):
+            from pandas.core.tools.datetimes import (
+                _guess_datetime_format_for_array,
+                to_datetime,
+            )
+
+            # Ignore any UserWarning emitted while guessing the format.
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", UserWarning)
+                fmt = _guess_datetime_format_for_array(values)
+
+            if fmt is None:
+                raise ValueError(
+                    "Ambiguous datetime string format detected. "
+                    "Specify a format via `pd.to_datetime(..., format=...)` "
+                    "or use `dayfirst=True`."
+                )
+
+            try:
+                # Validate consistent parsing
+                to_datetime(values, format=fmt, dayfirst=False)
+            except ValueError:
+                raise ValueError(
+                    "Inconsistent or ambiguous datetime strings detected. "
+                    "Specify `format=...` "
+                    "or use `dayfirst=True` to ensure correct parsing."
+                ) from None
+
         if issubclass(values.dtype.type, str):
             values = np.array(values, dtype=object)
 
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index 20dd7b0c4d3e7..2922e8eedad8c 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -816,6 +816,18 @@ def test_setitem_index_object_dtype_not_inferring(self):
         )
         tm.assert_frame_equal(df, expected)
 
+    def test_setitem_with_ambiguous_datetime_strings_raises(self):
+        df = DataFrame({"a": date_range("2020-01-01", periods=2)})
+        with pytest.raises(
+            ValueError,
+            match=(
+                "(?i)ambiguous datetime string format|"
+                "inconsistent or ambiguous datetime strings"
+            ),
+        ):
+            ambiguous_dates = Series(["12/01/2020", "13/01/2020"], dtype=object)
+            df.loc[:, "a"] = ambiguous_dates
+
 
 class TestSetitemTZAwareValues:
     @pytest.fixture
@@ -1399,3 +1411,15 @@ def test_setitem_partial_row_multiple_columns():
         }
     )
     tm.assert_frame_equal(df, expected)
+
+
+def test_constructor_with_ambiguous_datetime_strings_raises():
+    with pytest.raises(
+        ValueError,
+        match=(
+            "(?i)ambiguous datetime string format|"
+            "inconsistent or ambiguous datetime strings"
+        ),
+    ):
+        df = DataFrame({"a": Series(["12/01/2020", "13/01/2020"], dtype="object")})
+        df.astype({"a": "datetime64[ns]"})