From bd4061cc59561d4ff1cda0af1486dd933e664e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sylvain=20Mari=C3=A9?= Date: Mon, 12 Jul 2021 10:04:24 +0200 Subject: [PATCH 01/27] Update datetimes.py Following https://github.com/pandas-dev/pandas/pull/42244 , improved documentation about datetime parsing. See also https://github.com/pandas-dev/pandas/issues/42229#issuecomment-870009579 --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 26349a3b2c6c1..8ef519ce7eb69 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -763,7 +763,7 @@ def to_datetime( Return type depends on input: - list-like: - - DatetimeIndex, if timezone naive or aware with the same timezone + - DatetimeIndex, if timezone naive or aware with constant time offset - Index of object dtype, if timezone aware with mixed time offsets - Series: Series of datetime64 dtype - scalar: Timestamp From 66c725d5f1668b46fde8e59341e74c8f7c2b3652 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 4 Oct 2021 18:45:32 +0200 Subject: [PATCH 02/27] from code review: improved utc doc --- pandas/core/tools/datetimes.py | 37 +++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8ef519ce7eb69..af7387ba9eb69 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -714,8 +714,34 @@ def to_datetime( Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil behavior). utc : bool, default None - Return UTC DatetimeIndex if True (converting any tz-aware - datetime.datetime objects as well). + Control timezone localization and conversion. + + - if True, returns a timezone-aware UTC-localized Timestamp, Series or + DatetimeIndex. Any tz-naive element will be *localized* as UTC. 
+ Any already tz-aware input element (e.g. timezone-aware + datetime.datetime object, or datetime string with explicit timezone + offset) will be *converted* to UTC. + + - If False (default), for scalar inputs, the result will be a + timezone-aware Timestamp if the scalar is timezone-aware, otherwise + it will be a timezone-naive Timestamp. + For multiple inputs (list, series): + + - Tz-aware datetime.datetime inputs are not supported (raise + ValueError). + - The result will be a timezone-aware Series or DatetimeIndex + ONLY if all time offsets in string datetime inputs are + identical. + - If all inputs are timezone-naive, the result will be + timezone-naive. + - In other cases, for example if the time offset is + not identical in all string entries, the result will be an Index + of dtype object. + + See pandas general documentation about timezone conversion and + localization: + https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-zone-handling + format : str, default None The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. @@ -763,8 +789,9 @@ def to_datetime( Return type depends on input: - list-like: - - DatetimeIndex, if timezone naive or aware with constant time offset - - Index of object dtype, if timezone aware with mixed time offsets + - DatetimeIndex, if timezone naive or aware with constant time + offset. + - Index of object dtype, if timezone aware with mixed time offsets. - Series: Series of datetime64 dtype - scalar: Timestamp @@ -842,7 +869,7 @@ def to_datetime( dtype='datetime64[ns]', freq=None) In case input is list-like and the elements of input are of mixed - timezones, return will have object type Index if utc=False. + time offsets, return will have object type Index if utc=False. 
>>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500']) Index([2018-10-26 12:00:00-05:30, 2018-10-26 12:00:00-05:00], dtype='object') From f5cbef8bb1e9c92288c72a20ac67bed21d954c8c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 5 Oct 2021 11:57:25 +0200 Subject: [PATCH 03/27] Improved overall readability by - adding an extended summary with details, instead of putting all details in the arguments and outputs description. - Fixing return type hints to add the type hints for datetime.datetime scalars or array-like. - adding an explicit "raises" section - adding illustrative examples --- pandas/core/tools/datetimes.py | 154 ++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 22 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e4d03187e7551..e8f229a56ba83 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -685,10 +685,51 @@ def to_datetime( infer_datetime_format: bool = False, origin="unix", cache: bool = True, -) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: +) -> DatetimeIndex | Index | Series | Timestamp | DatetimeScalar | NaTType | None: """ Convert argument to datetime. + This function converts a scalar, array-like, :class:`Series` or + :class:`DataFrame`/dict-like to a pandas datetime object. + + - scalars can be int, float, str, datetime object (from stdlib datetime + module or numpy). They are converted to :class:`Timestamp` when possible, + otherwise they are converted to ``datetime.datetime``. None/NaN/null + scalars are converted to ``NaT``. + + - array-like can contain int, float, str, datetime objects. They are + converted to :class:`DatetimeIndex` when possible, otherwise they are + converted to :class:`Index` with object dtype, containing + ``datetime.datetime``. None/NaN/null entries are converted to ``NaT`` in + both cases. 
+ + - :class:`Series` are converted to :class:`Series` with datetime64 dtype + when possible, otherwise they are converted to :class:`Series` with + object dtype, containing ``datetime.datetime``. None/NaN/null entries + are converted to ``NaT`` in both cases. + + - :class:`DataFrame`/dict-like are converted to :class:`Series` with + datetime64 dtype. For each row a datetime is created from assembling + the various dataframe columns. Column keys can be common abbreviations + like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or + plurals of the same. + + The following causes are responsible for datetime.datetime objects being + returned (possibly inside an Index or a Series with object dtype) instead + of a proper pandas designated type (Timestamp, DatetimeIndex or Series + with datetime64 dtype): + + - when any input element is before Timestamp.min or after Timestamp.max, + see `timestamp limitations + `_. + + - when utc=False (default) and the input is an array-like or Series + containing mixed naive/aware datetime, or aware with mixed time offsets. + Note that this happens in the (quite frequent) situation when the + timezone has a daylight savings policy. In that case you may wish to + use utc=True. + Parameters ---------- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like @@ -747,15 +788,16 @@ def to_datetime( not identical in all string entries, the result will be an Index of dtype object. - See pandas general documentation about timezone conversion and - localization: - https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-zone-handling + See pandas general documentation about `timezone conversion and + localization + `_. format : str, default None The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse - all the way up to nanoseconds. 
- See strftime documentation for more information on choices: - https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + all the way up to nanoseconds. See `strftime documentation + `_ for more information on choices. exact : bool, True by default Behaves as: - If True, require an exact format match. @@ -797,17 +839,25 @@ def to_datetime( If parsing succeeded. Return type depends on input: - - list-like: - - DatetimeIndex, if timezone naive or aware with constant time - offset. - - Index of object dtype, if timezone aware with mixed time offsets. - - Series: Series of datetime64 dtype + - array-like: DatetimeIndex + - Series or DataFrame: Series of datetime64 dtype - scalar: Timestamp - In case when it is not possible to return designated types (e.g. when - any element of input is before Timestamp.min or after Timestamp.max) - return will have datetime.datetime type (or corresponding - array/Series). + Warning: in some situations the return type can not be one of the above + and is is rather datetime.datetime (scalar input) or Series with object + dtype containing datetime.datetime objects (array-like or Series + input). See above documentation for details, as well as examples + below. + + Raises + ------ + ParserError + When parsing a date from string fails. + ValueError + When another datetime conversion error happens. For example when one + of 'year', 'month', day' is missing in a :class:`DataFrame`, or when + a Tz-aware datetime.datetime is found in an array-like of mixed time + offsets, and utc=False. See Also -------- @@ -877,16 +927,76 @@ def to_datetime( DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) - In case input is list-like and the elements of input are of mixed - time offsets, return will have object type Index if utc=False. + .. warning:: By default (utc=False), all items in an input array must + either be all tz-naive, or all tz-aware with the same offset. 
Mixed + offsets result in datetime.datetime objects being returned instead, + see examples below. + + Default (utc=False) and tz-naive returns tz-naive DatetimeIndex: + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], \ + dtype='datetime64[ns]', freq=None) + + Default (utc=False) and tz-aware with constant offset returns tz-aware + DatetimeIndex: + + >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) + DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], \ + dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None) + + Default (utc=False) and tz-aware with mixed offsets (for example from a + timezone with daylight savings) returns a simple Index containing + datetime.datetime objects: + + >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) + Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], \ + dtype='object') + + Default (utc=False) and a mix of tz-aware and tz-naive returns a tz-aware + DatetimeIndex if the tz-naive are datetime... - >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500']) - Index([2018-10-26 12:00:00-05:30, 2018-10-26 12:00:00-05:00], dtype='object') + >>> from datetime import datetime + >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) + DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], \ + dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) + + ...but does not if the tz-naive are strings + + >>> pd.to_datetime(["2020-01-01 01:00 -01:00", "2020-01-01 03:00"]) + Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') + + Special case: mixing tz-aware string and datetime fails when utc=False, + even if they have the same time offset. 
+ + >>> from datetime import datetime, timezone, timedelta + >>> d = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) + >>> d + datetime.datetime(2020, 1, 1, 18, 0, \ + tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=82800))) + >>> pd.to_datetime(["2020-01-01 17:00 -0100", d]) + Traceback (most recent call last): + ... + ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 \ + unless utc=True + + Setting utc=True solves most of the above issues, as tz-naive elements + will be localized to UTC, while tz-aware ones will simply be converted to + UTC (exact same datetime, but represented differently): >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], ... utc=True) - DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], \ + dtype='datetime64[ns, UTC]', freq=None) + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530', + ... datetime(2020, 1, 1, 18), + ... datetime(2020, 1, 1, 18, + ... tzinfo=timezone(-timedelta(hours=1)))], + ... utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00', \ + '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], \ + dtype='datetime64[ns, UTC]', freq=None) """ if arg is None: return None From 866bdcb26e2de6aec4d3bca8ac4999bb71e8366f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 5 Oct 2021 11:59:14 +0200 Subject: [PATCH 04/27] minor improvement --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e8f229a56ba83..b5332b1fc60e5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -764,7 +764,7 @@ def to_datetime( with year first. utc : bool, default None - Control timezone localization and conversion. 
+ Control timezone-related parsing, localization and conversion. - if True, returns a timezone-aware UTC-localized Timestamp, Series or DatetimeIndex. Any tz-naive element will be *localized* as UTC. From cd3ec356cddfb92761b8ddf6bb56962854ce3950 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 5 Oct 2021 12:09:15 +0200 Subject: [PATCH 05/27] Minor fix and improvement again --- pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b5332b1fc60e5..66b8c30bcfc56 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -843,8 +843,8 @@ def to_datetime( - Series or DataFrame: Series of datetime64 dtype - scalar: Timestamp - Warning: in some situations the return type can not be one of the above - and is is rather datetime.datetime (scalar input) or Series with object + Note: in some situations the return type can not be one of the above + and is rather datetime.datetime (scalar input) or Series with object dtype containing datetime.datetime objects (array-like or Series input). See above documentation for details, as well as examples below. From 1be053afdaa899e59b45825e469806c6be52989e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 5 Oct 2021 12:10:00 +0200 Subject: [PATCH 06/27] Changed order of output description to match the global section doc --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 66b8c30bcfc56..2f2d2b02adb07 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -839,9 +839,9 @@ def to_datetime( If parsing succeeded. 
Return type depends on input: + - scalar: Timestamp - array-like: DatetimeIndex - Series or DataFrame: Series of datetime64 dtype - - scalar: Timestamp Note: in some situations the return type can not be one of the above and is rather datetime.datetime (scalar input) or Series with object From 41a1e53c1228bf2e6d4b4898142abb9f60a67962 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 5 Oct 2021 15:52:31 +0200 Subject: [PATCH 07/27] Removed the "type: ignore" since the return type hints are now fixed --- pandas/core/tools/datetimes.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2f2d2b02adb07..7a3a3e6b5e809 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1061,10 +1061,7 @@ def to_datetime( else: result = convert_listlike(np.array([arg]), format)[0] - # error: Incompatible return value type (got "Union[Timestamp, NaTType, - # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, - # NaTType, None]") - return result # type: ignore[return-value] + return result # mappings for assembling units From 95cfc545f82482cda90845a9f491b5498872b18d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 8 Oct 2021 11:34:20 +0200 Subject: [PATCH 08/27] Removed type hint-related mods (will move to a separate pr) --- pandas/core/tools/datetimes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7a3a3e6b5e809..aa44a7f32a61d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -685,7 +685,7 @@ def to_datetime( infer_datetime_format: bool = False, origin="unix", cache: bool = True, -) -> DatetimeIndex | Index | Series | Timestamp | DatetimeScalar | NaTType | None: +) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: """ Convert argument to datetime. 
@@ -1061,7 +1061,10 @@ def to_datetime( else: result = convert_listlike(np.array([arg]), format)[0] - return result + # error: Incompatible return value type (got "Union[Timestamp, NaTType, + # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, + # NaTType, None]") + return result # type: ignore[return-value] # mappings for assembling units From 8ebd77e8ebe622f3e0543a2f742b56f283e63f4f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 11 Oct 2021 14:40:45 +0200 Subject: [PATCH 09/27] Removed backslash characters from doctests as per code review --- pandas/core/tools/datetimes.py | 35 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index aa44a7f32a61d..68cac03721553 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -935,31 +935,31 @@ def to_datetime( Default (utc=False) and tz-naive returns tz-naive DatetimeIndex: >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) - DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], \ - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], + dtype='datetime64[ns]', freq=None) Default (utc=False) and tz-aware with constant offset returns tz-aware DatetimeIndex: >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) - DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], \ - dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None) + DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], + dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None) Default (utc=False) and tz-aware with mixed offsets (for example from a timezone with daylight savings) returns a simple Index containing datetime.datetime objects: >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) - Index([2020-10-25 02:00:00+02:00, 2020-10-25 
04:00:00+01:00], \ - dtype='object') + Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], + dtype='object') Default (utc=False) and a mix of tz-aware and tz-naive returns a tz-aware DatetimeIndex if the tz-naive are datetime... >>> from datetime import datetime >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) - DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], \ - dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) + DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], + dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) ...but does not if the tz-naive are strings @@ -972,13 +972,14 @@ def to_datetime( >>> from datetime import datetime, timezone, timedelta >>> d = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) >>> d - datetime.datetime(2020, 1, 1, 18, 0, \ - tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=82800))) + datetime.datetime(2020, 1, 1, 18, 0, + tzinfo=datetime.timezone(datetime.timedelta(days=-1, + seconds=82800))) >>> pd.to_datetime(["2020-01-01 17:00 -0100", d]) Traceback (most recent call last): ... - ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 \ - unless utc=True + ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 + unless utc=True Setting utc=True solves most of the above issues, as tz-naive elements will be localized to UTC, while tz-aware ones will simply be converted to @@ -986,17 +987,17 @@ def to_datetime( >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], ... utc=True) - DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], \ - dtype='datetime64[ns, UTC]', freq=None) + DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530', ... datetime(2020, 1, 1, 18), ... datetime(2020, 1, 1, 18, ... 
tzinfo=timezone(-timedelta(hours=1)))], ... utc=True) - DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00', \ - '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], \ - dtype='datetime64[ns, UTC]', freq=None) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00', + '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) """ if arg is None: return None From 8baf7bf2d8fac66892152f7578a5209a1606f2fc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 11 Oct 2021 14:46:28 +0200 Subject: [PATCH 10/27] As per code review: replaced all "tz-" with "timezone-" --- pandas/core/tools/datetimes.py | 43 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 68cac03721553..9507839678ca5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -767,8 +767,8 @@ def to_datetime( Control timezone-related parsing, localization and conversion. - if True, returns a timezone-aware UTC-localized Timestamp, Series or - DatetimeIndex. Any tz-naive element will be *localized* as UTC. - Any already tz-aware input element (e.g. timezone-aware + DatetimeIndex. Any timezone-naive element will be *localized* as UTC. + Any already timezone-aware input element (e.g. timezone-aware datetime.datetime object, or datetime string with explicit timezone offset) will be *converted* to UTC. @@ -777,7 +777,7 @@ def to_datetime( it will be a timezone-naive Timestamp. For multiple inputs (list, series): - - Tz-aware datetime.datetime inputs are not supported (raise + - Timezone-aware datetime.datetime inputs are not supported (raise ValueError). - The result will be a timezone-aware Series or DatetimeIndex ONLY if all time offsets in string datetime inputs are @@ -856,8 +856,8 @@ def to_datetime( ValueError When another datetime conversion error happens. 
For example when one of 'year', 'month', day' is missing in a :class:`DataFrame`, or when - a Tz-aware datetime.datetime is found in an array-like of mixed time - offsets, and utc=False. + a Timezone-aware datetime.datetime is found in an array-like of mixed + time offsets, and utc=False. See Also -------- @@ -928,46 +928,47 @@ def to_datetime( dtype='datetime64[ns]', freq=None) .. warning:: By default (utc=False), all items in an input array must - either be all tz-naive, or all tz-aware with the same offset. Mixed - offsets result in datetime.datetime objects being returned instead, - see examples below. + either be all timezone-naive, or all timezone-aware with the same + offset. Mixed offsets result in datetime.datetime objects being + returned instead, see examples below. - Default (utc=False) and tz-naive returns tz-naive DatetimeIndex: + Default (utc=False) and timezone-naive returns timezone-naive + DatetimeIndex: >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - Default (utc=False) and tz-aware with constant offset returns tz-aware - DatetimeIndex: + Default (utc=False) and timezone-aware with constant offset returns + timezone-aware DatetimeIndex: >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None) - Default (utc=False) and tz-aware with mixed offsets (for example from a - timezone with daylight savings) returns a simple Index containing + Default (utc=False) and timezone-aware with mixed offsets (for example from + a timezone with daylight savings) returns a simple Index containing datetime.datetime objects: >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], dtype='object') - Default (utc=False) and a mix of 
tz-aware and tz-naive returns a tz-aware - DatetimeIndex if the tz-naive are datetime... + Default (utc=False) and a mix of timezone-aware and timezone-naive returns + a timezone-aware DatetimeIndex if the timezone-naive are datetime... >>> from datetime import datetime >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) - ...but does not if the tz-naive are strings + ...but does not if the timezone-naive are strings >>> pd.to_datetime(["2020-01-01 01:00 -01:00", "2020-01-01 03:00"]) Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') - Special case: mixing tz-aware string and datetime fails when utc=False, - even if they have the same time offset. + Special case: mixing timezone-aware string and datetime fails when + utc=False, even if they have the same time offset. >>> from datetime import datetime, timezone, timedelta >>> d = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) @@ -981,9 +982,9 @@ def to_datetime( ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True - Setting utc=True solves most of the above issues, as tz-naive elements - will be localized to UTC, while tz-aware ones will simply be converted to - UTC (exact same datetime, but represented differently): + Setting utc=True solves most of the above issues, as timezone-naive + elements will be localized to UTC, while timezone-aware ones will simply be + converted to UTC (exact same datetime, but represented differently): >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], ... 
utc=True) From bc269456bced3e55f2cf5c35b3c81bdf21517b3c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 11 Oct 2021 14:48:53 +0200 Subject: [PATCH 11/27] Code review: capitalized if --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9507839678ca5..014020706abb5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -766,7 +766,7 @@ def to_datetime( utc : bool, default None Control timezone-related parsing, localization and conversion. - - if True, returns a timezone-aware UTC-localized Timestamp, Series or + - If True, returns a timezone-aware UTC-localized Timestamp, Series or DatetimeIndex. Any timezone-naive element will be *localized* as UTC. Any already timezone-aware input element (e.g. timezone-aware datetime.datetime object, or datetime string with explicit timezone From 83ef8502e2e2edbce3e78d8f45d77ab7aae9bc37 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 28 Oct 2021 16:31:46 +0200 Subject: [PATCH 12/27] Compressed output description as per code review. --- pandas/core/tools/datetimes.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 014020706abb5..5410d44ca4411 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -837,17 +837,14 @@ def to_datetime( ------- datetime If parsing succeeded. - Return type depends on input: - - - scalar: Timestamp - - array-like: DatetimeIndex - - Series or DataFrame: Series of datetime64 dtype - - Note: in some situations the return type can not be one of the above - and is rather datetime.datetime (scalar input) or Series with object - dtype containing datetime.datetime objects (array-like or Series - input). See above documentation for details, as well as examples - below. 
+ Return type depends on input (types in parenthesis correspond to + timezone handling issues): + + - scalar: Timestamp (or datetime.datetime) + - array-like: DatetimeIndex (or Series with object dtype containing + datetime.datetime) + - Series or DataFrame: Series of datetime64 dtype (or Series or object + dtype containing datetime.datetime) Raises ------ From 0b217725c1b539c1daf943d8fcbafd6af6e6f230 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 28 Oct 2021 17:02:50 +0200 Subject: [PATCH 13/27] Moved the general summary to a notes section --- pandas/core/tools/datetimes.py | 81 ++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 5410d44ca4411..f837096667863 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -692,44 +692,6 @@ def to_datetime( This function converts a scalar, array-like, :class:`Series` or :class:`DataFrame`/dict-like to a pandas datetime object. - - scalars can be int, float, str, datetime object (from stdlib datetime - module or numpy). They are converted to :class:`Timestamp` when possible, - otherwise they are converted to ``datetime.datetime``. None/NaN/null - scalars are converted to ``NaT``. - - - array-like can contain int, float, str, datetime objects. They are - converted to :class:`DatetimeIndex` when possible, otherwise they are - converted to :class:`Index` with object dtype, containing - ``datetime.datetime``. None/NaN/null entries are converted to ``NaT`` in - both cases. - - - :class:`Series` are converted to :class:`Series` with datetime64 dtype - when possible, otherwise they are converted to :class:`Series` with - object dtype, containing ``datetime.datetime``. None/NaN/null entries - are converted to ``NaT`` in both cases. - - - :class:`DataFrame`/dict-like are converted to :class:`Series` with - datetime64 dtype. 
For each row a datetime is created from assembling - the various dataframe columns. Column keys can be common abbreviations - like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or - plurals of the same. - - The following causes are responsible for datetime.datetime objects being - returned (possibly inside an Index or a Series with object dtype) instead - of a proper pandas designated type (Timestamp, DatetimeIndex or Series - with datetime64 dtype): - - - when any input element is before Timestamp.min or after Timestamp.max, - see `timestamp limitations - `_. - - - when utc=False (default) and the input is an array-like or Series - containing mixed naive/aware datetime, or aware with mixed time offsets. - Note that this happens in the (quite frequent) situation when the - timezone has a daylight savings policy. In that case you may wish to - use utc=True. - Parameters ---------- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like @@ -862,6 +824,49 @@ def to_datetime( to_timedelta : Convert argument to timedelta. convert_dtypes : Convert dtypes. + Notes + ----- + + Many input types are supported, and lead to different output types: + + - scalars can be int, float, str, datetime object (from stdlib datetime + module or numpy). They are converted to :class:`Timestamp` when possible, + otherwise they are converted to ``datetime.datetime``. None/NaN/null + scalars are converted to ``NaT``. + + - array-like can contain int, float, str, datetime objects. They are + converted to :class:`DatetimeIndex` when possible, otherwise they are + converted to :class:`Index` with object dtype, containing + ``datetime.datetime``. None/NaN/null entries are converted to ``NaT`` in + both cases. + + - :class:`Series` are converted to :class:`Series` with datetime64 dtype + when possible, otherwise they are converted to :class:`Series` with + object dtype, containing ``datetime.datetime``. 
None/NaN/null entries + are converted to ``NaT`` in both cases. + + - :class:`DataFrame`/dict-like are converted to :class:`Series` with + datetime64 dtype. For each row a datetime is created from assembling + the various dataframe columns. Column keys can be common abbreviations + like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or + plurals of the same. + + The following causes are responsible for datetime.datetime objects being + returned (possibly inside an Index or a Series with object dtype) instead + of a proper pandas designated type (Timestamp, DatetimeIndex or Series + with datetime64 dtype): + + - when any input element is before Timestamp.min or after Timestamp.max, + see `timestamp limitations + <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html + #timeseries-timestamp-limits>`_. + + - when utc=False (default) and the input is an array-like or Series + containing mixed naive/aware datetime, or aware with mixed time offsets. + Note that this happens in the (quite frequent) situation when the + timezone has a daylight savings policy. In that case you may wish to + use utc=True. + Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be From 83ddfe7bbb4b43227467df2ce63bf8fb76d7e110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sylvain=20Mari=C3=A9?= Date: Thu, 28 Oct 2021 17:05:55 +0200 Subject: [PATCH 14/27] Update pandas/core/tools/datetimes.py --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f837096667863..40b38a3b2568e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -800,7 +800,7 @@ def to_datetime( datetime If parsing succeeded. 
Return type depends on input (types in parenthesis correspond to - timezone handling issues): + timezone or out-of-range timestamp handling issues): - scalar: Timestamp (or datetime.datetime) - array-like: DatetimeIndex (or Series with object dtype containing From 8e1ebf0a3deb6c97ff83010c1b5428fd8f864c7d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 17 Dec 2021 18:22:22 +0100 Subject: [PATCH 15/27] As per code review: reduced the utc param description and added structure to the examples section. --- pandas/core/tools/datetimes.py | 111 ++++++++++++++++----------------- 1 file changed, 53 insertions(+), 58 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 40b38a3b2568e..05cd7145d9fab 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -728,29 +728,19 @@ def to_datetime( utc : bool, default None Control timezone-related parsing, localization and conversion. - - If True, returns a timezone-aware UTC-localized Timestamp, Series or - DatetimeIndex. Any timezone-naive element will be *localized* as UTC. - Any already timezone-aware input element (e.g. timezone-aware - datetime.datetime object, or datetime string with explicit timezone - offset) will be *converted* to UTC. - - - If False (default), for scalar inputs, the result will be a - timezone-aware Timestamp if the scalar is timezone-aware, otherwise - it will be a timezone-naive Timestamp. - For multiple inputs (list, series): - - - Timezone-aware datetime.datetime inputs are not supported (raise - ValueError). - - The result will be a timezone-aware Series or DatetimeIndex - ONLY if all time offsets in string datetime inputs are - identical. - - If all inputs are timezone-naive, the result will be - timezone-naive. - - In other cases, for example if the time offset is - not identical in all string entries, the result will be an Index - of dtype object. 
- - See pandas general documentation about `timezone conversion and + - If ``True``, the function *always* returns a timezone-aware + UTC-localized Timestamp, Series or DatetimeIndex. To do this, + timezone-naive inputs are *localized* as UTC (e.g. + ``01-01-2020 01:00:00`` becomes ``01-01-2020 01:00:00Z``), while + timezone-aware inputs are *converted* to UTC (e.g. + ``01-01-2020 01:00:00+0100`` becomes ``01-01-2020 00:00:00Z``). + + - If ``False`` (default), the result is a "best effort automation", + with some limitations - in particular for timezones with daylight + savings. See :ref:`Examples <to_datetime_tz_examples>` section for + details. + + See also: pandas general documentation about `timezone conversion and localization <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html #time-zone-handling>`_. @@ -869,6 +859,9 @@ def to_datetime( Examples -------- + + **a. Handling various input formats** + Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same @@ -881,20 +874,7 @@ def to_datetime( 1 2016-03-05 dtype: datetime64[ns] - If a date does not meet the `timestamp limitations - <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html - #timeseries-timestamp-limits>`_, passing errors='ignore' - will return the original input instead of raising any exception. - - Passing errors='coerce' will force an out-of-bounds date to NaT, - in addition to forcing non-dates (or non-parseable dates) to NaT. - - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') - datetime.datetime(1300, 1, 1, 0, 0) - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') - NaT - - Passing infer_datetime_format=True can often-times speedup a parsing + Passing ``infer_datetime_format=True`` can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) @@ -929,62 +909,77 @@ def to_datetime( DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) - .. 
warning:: By default (utc=False), all items in an input array must - either be all timezone-naive, or all timezone-aware with the same - offset. Mixed offsets result in datetime.datetime objects being - returned instead, see examples below. + **b. Non-convertible date/times** + + If a date does not meet the `timestamp limitations + `_, passing errors='ignore' + will return the original input instead of raising any exception. + + Passing ``errors='coerce'`` will force an out-of-bounds date to ``NaT``, + in addition to forcing non-dates (or non-parseable dates) to ``NaT``. + + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') + datetime.datetime(1300, 1, 1, 0, 0) + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') + NaT + + .. _to_datetime_tz_examples: - Default (utc=False) and timezone-naive returns timezone-naive - DatetimeIndex: + **c. Timezones and time offsets** + + The default behaviour (``utc=False``) might be confusing concerning timezones: + + - Timezone-naive inputs are converted to timezone-naive ``DatetimeIndex``: >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - Default (utc=False) and timezone-aware with constant offset returns - timezone-aware DatetimeIndex: + - Timezone-aware inputs *with constant time offset* are converted to + timezone-aware ``DatetimeIndex``: >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None) - Default (utc=False) and timezone-aware with mixed offsets (for example from - a timezone with daylight savings) returns a simple Index containing - datetime.datetime objects: + - However, timezone-aware inputs *with mixed time offsets* (for example + issued from a timezone with daylight savings, such as Europe/Paris) + are **not successfully 
converted** to a ``DatetimeIndex``. Instead a + simple ``Index`` containing ``datetime.datetime`` objects is returned: >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], dtype='object') - Default (utc=False) and a mix of timezone-aware and timezone-naive returns - a timezone-aware DatetimeIndex if the timezone-naive are datetime... + - A mix of timezone-aware and timezone-naive inputs is converted to + a timezone-aware ``DatetimeIndex`` but only if the timezone-naive + elements are ``datetime.datetime``... >>> from datetime import datetime >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) - ...but does not if the timezone-naive are strings + - ...and not if the timezone-naive elements are strings >>> pd.to_datetime(["2020-01-01 01:00 -01:00", "2020-01-01 03:00"]) Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') - Special case: mixing timezone-aware string and datetime fails when - utc=False, even if they have the same time offset. + - Finally, mixing timezone-aware strings and ``datetime.datetime`` always + raises an error, even if the elements all have the same time offset. >>> from datetime import datetime, timezone, timedelta >>> d = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - >>> d - datetime.datetime(2020, 1, 1, 18, 0, - tzinfo=datetime.timezone(datetime.timedelta(days=-1, - seconds=82800))) >>> pd.to_datetime(["2020-01-01 17:00 -0100", d]) Traceback (most recent call last): ... 
ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True - Setting utc=True solves most of the above issues, as timezone-naive + | + + Setting ``utc=True`` solves most of the above issues, as timezone-naive elements will be localized to UTC, while timezone-aware ones will simply be converted to UTC (exact same datetime, but represented differently): From 53107796887a5efb2a0d65e439d3308f3cc7c75f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 17 Dec 2021 18:29:14 +0100 Subject: [PATCH 16/27] Minor edits --- pandas/core/tools/datetimes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 155936c694e04..35759a75e3c0c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -798,7 +798,9 @@ def to_datetime( - scalar: Timestamp (or datetime.datetime) - array-like: DatetimeIndex (or Series with object dtype containing datetime.datetime) - - Series or DataFrame: Series of datetime64 dtype (or Series or object + - Series: Series of datetime64 dtype (or Series of object + dtype containing datetime.datetime) + - DataFrame: Series of datetime64 dtype (or Series of object dtype containing datetime.datetime) Raises From 2b225446b84578d2cbd8a7656e0da3b93cd875bc Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 17 Dec 2021 23:06:57 +0100 Subject: [PATCH 17/27] Changed as per code review --- pandas/core/tools/datetimes.py | 50 ++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 35759a75e3c0c..54bab946e27e4 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -733,15 +733,14 @@ def to_datetime( - If ``True``, the function *always* returns a timezone-aware UTC-localized Timestamp, Series or DatetimeIndex. To do this, - timezone-naive inputs are *localized* as UTC (e.g. 
- ``01-01-2020 01:00:00`` becomes ``01-01-2020 01:00:00Z``), while - timezone-aware inputs are *converted* to UTC (e.g. - ``01-01-2020 01:00:00+0100`` becomes ``01-01-2020 00:00:00Z``). + timezone-naive inputs are *localized* as UTC, while + timezone-aware inputs are *converted* to UTC. - - If ``False`` (default), the result is a "best effort automation", - with some limitations - in particular for timezones with daylight - savings. See :ref:`Examples ` section for - details. + - If ``False`` (default), inputs will not be coerced to UTC. + Timezone-naive inputs will remain naive, while timezone-aware ones + will keep their time offsets. Limitations exist for mixed + offsets (typically, daylight savings), see :ref:`Examples + ` section for details. See also: pandas general documentation about `timezone conversion and localization @@ -793,7 +792,7 @@ def to_datetime( datetime If parsing succeeded. Return type depends on input (types in parenthesis correspond to - timezone or out-of-range timestamp handling issues): + fallback in case of timezone issues or out-of-range timestamps): - scalar: Timestamp (or datetime.datetime) - array-like: DatetimeIndex (or Series with object dtype containing @@ -865,7 +864,7 @@ def to_datetime( Examples -------- - **a. Handling various input formats** + **Handling various input formats** Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', @@ -914,7 +913,7 @@ def to_datetime( DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) - **b. 
Non-convertible date/times** + **Non-convertible date/times** If a date does not meet the `timestamp limitations >> from datetime import datetime >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) - - ...and not if the timezone-naive elements are strings - - >>> pd.to_datetime(["2020-01-01 01:00 -01:00", "2020-01-01 03:00"]) - Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') - - Finally, mixing timezone-aware strings and ``datetime.datetime`` always raises an error, even if the elements all have the same time offset. @@ -984,15 +978,25 @@ def to_datetime( | - Setting ``utc=True`` solves most of the above issues, as timezone-naive - elements will be localized to UTC, while timezone-aware ones will simply be - converted to UTC (exact same datetime, but represented differently): + Setting ``utc=True`` solves most of the above issues: + + - Timezone-naive inputs are *localized* as UTC + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + + - Timezone-aware inputs are *converted* to UTC (the output represents the + exact same datetime, but viewed from the UTC time offset `+00:00`). >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], ... utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) + - Inputs can contain both naive and aware, string or datetime, the above + rules still apply + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530', ... datetime(2020, 1, 1, 18), ... 
datetime(2020, 1, 1, 18, From 2b63ea7abca671c28931804261b28b52f9774a13 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Dec 2021 10:59:20 +0100 Subject: [PATCH 18/27] Changed as per code review --- pandas/core/tools/datetimes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 54bab946e27e4..654781f2aeb80 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -792,7 +792,8 @@ def to_datetime( datetime If parsing succeeded. Return type depends on input (types in parenthesis correspond to - fallback in case of timezone issues or out-of-range timestamps): + fallback in case of unsuccessful timezone or out-of-range timestamp + parsing): - scalar: Timestamp (or datetime.datetime) - array-like: DatetimeIndex (or Series with object dtype containing From 70a7c8f0e41395b9c0813b892249bd50c1e607f7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Dec 2021 14:42:52 +0100 Subject: [PATCH 19/27] what's new attempt --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index caf3a4281561f..698e894b398b0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -637,8 +637,8 @@ Timedelta Timezones ^^^^^^^^^ - Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse zero UTC offset (``Z``) correctly (:issue:`41047`) +- Clarified :func:`to_datetime` documentation concerning parameter ``utc`` and the impact of its default value (``False``) on parsing datetimes from a timezone with varying time offsets (daylight savings) (:issue:`42229`). 
- Bug in :meth:`Series.dt.tz_convert` resetting index in a :class:`Series` with :class:`CategoricalIndex` (:issue:`43080`) -- Numeric ^^^^^^^ From 7739e12cb85235e242a10f577bc49c2d52d48b93 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 3 Jan 2022 10:20:44 +0100 Subject: [PATCH 20/27] Revert "what's new attempt" This reverts commit 70a7c8f0e41395b9c0813b892249bd50c1e607f7. --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 698e894b398b0..caf3a4281561f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -637,8 +637,8 @@ Timedelta Timezones ^^^^^^^^^ - Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse zero UTC offset (``Z``) correctly (:issue:`41047`) -- Clarified :func:`to_datetime` documentation concerning parameter ``utc`` and the impact of its default value (``False``) on parsing datetimes from a timezone with varying time offsets (daylight savings) (:issue:`42229`). - Bug in :meth:`Series.dt.tz_convert` resetting index in a :class:`Series` with :class:`CategoricalIndex` (:issue:`43080`) +- Numeric ^^^^^^^ From 5f4dbb818bb0ef80f1503ce1f3951da8d0499044 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 13:59:54 +0100 Subject: [PATCH 21/27] Changed as per code review: added sphinx directives wherever possible, instead of the double backticks. --- pandas/core/tools/datetimes.py | 154 +++++++++++++++++---------------- 1 file changed, 80 insertions(+), 74 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 654781f2aeb80..1a48dfc669e2c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -697,20 +697,20 @@ def to_datetime( Parameters ---------- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like - The object to convert to a datetime. 
If the DataFrame is provided, the method - expects minimally the following columns: "year", "month", "day". + The object to convert to a datetime. If a :class:`DataFrame` is provided, the + method expects minimally the following columns: "year", "month", "day". errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - - If 'coerce', then invalid parsing will be set as NaT. + - If 'coerce', then invalid parsing will be set as :const:`NaT`. - If 'ignore', then invalid parsing will return the input. dayfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. - If True, parses dates with the day first, eg 10/11/12 is parsed as - 2012-11-10. + If :const:`True`, parses dates with the day first, eg 10/11/12 is parsed + as 2012-11-10. .. warning:: - dayfirst=True is not strict, but will prefer to parse + ``dayfirst=True`` is not strict, but will prefer to parse with day first. If a delimited date string cannot be parsed in accordance with the given `dayfirst` option, e.g. ``to_datetime(['31-12-2021'])``, then a warning will be shown. @@ -718,25 +718,25 @@ def to_datetime( yearfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. - - If True parses dates with the year first, eg 10/11/12 is parsed as - 2010-11-12. - - If both dayfirst and yearfirst are True, yearfirst is preceded (same - as dateutil). + - If :const:`True` parses dates with the year first, e.g. 10/11/12 is + parsed as 2010-11-12. + - If both dayfirst and yearfirst are :const:`True`, yearfirst is preceded + (same as dateutil). .. warning:: - yearfirst=True is not strict, but will prefer to parse + ``yearfirst=True`` is not strict, but will prefer to parse with year first. utc : bool, default None Control timezone-related parsing, localization and conversion. - - If ``True``, the function *always* returns a timezone-aware - UTC-localized Timestamp, Series or DatetimeIndex. 
To do this, - timezone-naive inputs are *localized* as UTC, while - timezone-aware inputs are *converted* to UTC. + - If :const:`True`, the function *always* returns a timezone-aware + UTC-localized :class:`Timestamp`, :class:`Series` or + :class:`DatetimeIndex`. To do this, timezone-naive inputs are + *localized* as UTC, while timezone-aware inputs are *converted* to UTC. - - If ``False`` (default), inputs will not be coerced to UTC. + - If :const:`False` (default), inputs will not be coerced to UTC. Timezone-naive inputs will remain naive, while timezone-aware ones will keep their time offsets. Limitations exist for mixed offsets (typically, daylight savings), see :ref:`Examples @@ -748,44 +748,46 @@ def to_datetime( #time-zone-handling>`_. format : str, default None - The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + The strftime to parse time, e.g. ``"%d/%m/%Y"``. Note that "%f" will parse all the way up to nanoseconds. See `strftime documentation `_ for more information on choices. - exact : bool, True by default + exact : bool, default True Behaves as: - - If True, require an exact format match. - - If False, allow the format to match anywhere in the target string. + - If :const:`True`, require an exact format match. + - If :const:`False`, allow the format to match anywhere in the target + string. unit : str, default 'ns' The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. - Example, with unit='ms' and origin='unix' (the default), this + Example, with ``unit='ms'`` and ``origin='unix'`` (the default), this would calculate the number of milliseconds to the unix epoch start. 
infer_datetime_format : bool, default False - If True and no `format` is given, attempt to infer the format of the - datetime strings based on the first non-NaN element, + If :const:`True` and no `format` is given, attempt to infer the format of + the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - - If 'julian', unit must be 'D', and origin is set to beginning of - Julian Calendar. Julian day number 0 is assigned to the day starting + - If ``'unix'`` (or POSIX) time; origin is set to 1970-01-01. + - If ``'julian'``, unit must be ``'D'``, and origin is set to beginning + of Julian Calendar. Julian day number 0 is assigned to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. cache : bool, default True - If True, use a cache of unique, converted dates to apply the datetime - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. The cache is only - used when there are at least 50 values. The presence of out-of-bounds - values will render the cache unusable and may slow down parsing. + If :const:`True`, use a cache of unique, converted dates to apply the + datetime conversion. May produce significant speed-up when parsing + duplicate date strings, especially ones with timezone offsets. The cache + is only used when there are at least 50 values. The presence of + out-of-bounds values will render the cache unusable and may slow down + parsing. .. versionchanged:: 0.25.0 - - changed default value from False to True. + - changed default value from :const:`False` to :const:`True`. 
Returns ------- @@ -795,13 +797,15 @@ def to_datetime( fallback in case of unsuccessful timezone or out-of-range timestamp parsing): - - scalar: Timestamp (or datetime.datetime) - - array-like: DatetimeIndex (or Series with object dtype containing - datetime.datetime) - - Series: Series of datetime64 dtype (or Series of object - dtype containing datetime.datetime) - - DataFrame: Series of datetime64 dtype (or Series of object - dtype containing datetime.datetime) + - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) + - array-like: :class:`DatetimeIndex` (or :class:`Series` with + :class:`object` dtype containing :class:`datetime.datetime`) + - :class:`Series`: :class:`Series` of :class:`datetime64` dtype (or + :class:`Series` of :class:`object` dtype containing + :class:`datetime.datetime`) + - :class:`DataFrame`: :class:`Series` of :class:`datetime64` dtype (or + :class:`Series` of :class:`object` dtype containing + :class:`datetime.datetime`) Raises ------ @@ -810,8 +814,8 @@ def to_datetime( ValueError When another datetime conversion error happens. For example when one of 'year', 'month', day' is missing in a :class:`DataFrame`, or when - a Timezone-aware datetime.datetime is found in an array-like of mixed - time offsets, and utc=False. + a Timezone-aware :class:`datetime.datetime` is found in an array-like of + mixed time offsets, and ``utc=False``. See Also -------- @@ -824,51 +828,52 @@ def to_datetime( Many input types are supported, and lead to different output types: - - scalars can be int, float, str, datetime object (from stdlib datetime - module or numpy). They are converted to :class:`Timestamp` when possible, - otherwise they are converted to ``datetime.datetime``. None/NaN/null - scalars are converted to ``NaT``. + - scalars can be int, float, str, datetime object (from stdlib :mod:`datetime` + module or :mod:`numpy`). They are converted to :class:`Timestamp` when + possible, otherwise they are converted to :class:`datetime.datetime`. 
+ None/NaN/null scalars are converted to :const:`NaT`. - array-like can contain int, float, str, datetime objects. They are converted to :class:`DatetimeIndex` when possible, otherwise they are - converted to :class:`Index` with object dtype, containing - ``datetime.datetime``. None/NaN/null entries are converted to ``NaT`` in - both cases. + converted to :class:`Index` with :class:`object` dtype, containing + :class:`datetime.datetime`. None/NaN/null entries are converted to + :const:`NaT` in both cases. - - :class:`Series` are converted to :class:`Series` with datetime64 dtype - when possible, otherwise they are converted to :class:`Series` with - object dtype, containing ``datetime.datetime``. None/NaN/null entries - are converted to ``NaT`` in both cases. + - :class:`Series` are converted to :class:`Series` with :class:`datetime64` + dtype when possible, otherwise they are converted to :class:`Series` with + :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null + entries are converted to :const:`NaT` in both cases. - :class:`DataFrame`/dict-like are converted to :class:`Series` with - datetime64 dtype. For each row a datetime is created from assembling + :class:`datetime64` dtype. For each row a datetime is created from assembling the various dataframe columns. Column keys can be common abbreviations like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or plurals of the same. 
- The following causes are responsible for datetime.datetime objects being - returned (possibly inside an Index or a Series with object dtype) instead - of a proper pandas designated type (Timestamp, DatetimeIndex or Series - with datetime64 dtype): + The following causes are responsible for :class:`datetime.datetime` objects + being returned (possibly inside an :class:`Index` or a :class:`Series` with + :class:`object` dtype) instead of a proper pandas designated type + (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series` + with :class:`datetime64` dtype): - - when any input element is before Timestamp.min or after Timestamp.max, - see `timestamp limitations + - when any input element is before :const:`Timestamp.min` or after + :const:`Timestamp.max`, see `timestamp limitations <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html #timeseries-timestamp-limits>`_. - - when utc=False (default) and the input is an array-like or Series - containing mixed naive/aware datetime, or aware with mixed time offsets. - Note that this happens in the (quite frequent) situation when the - timezone has a daylight savings policy. In that case you may wish to - use utc=True. + - when ``utc=False`` (default) and the input is an array-like or + :class:`Series` containing mixed naive/aware datetime, or aware with mixed + time offsets. Note that this happens in the (quite frequent) situation when + the timezone has a daylight savings policy. In that case you may wish to + use ``utc=True``. Examples -------- **Handling various input formats** - Assembling a datetime from multiple columns of a DataFrame. The keys can be - common abbreviations like ['year', 'month', 'day', 'minute', 'second', + Assembling a datetime from multiple columns of a :class:`DataFrame`. 
The keys + can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], @@ -921,8 +926,8 @@ def to_datetime( #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. - Passing ``errors='coerce'`` will force an out-of-bounds date to ``NaT``, - in addition to forcing non-dates (or non-parseable dates) to ``NaT``. + Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, + in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) @@ -935,14 +940,14 @@ def to_datetime( The default behaviour (``utc=False``) is as follows: - - Timezone-naive inputs are converted to timezone-naive ``DatetimeIndex``: + - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to - timezone-aware ``DatetimeIndex``: + timezone-aware :class:`DatetimeIndex`: >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], @@ -950,15 +955,16 @@ def to_datetime( - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) - are **not successfully converted** to a ``DatetimeIndex``. Instead a - simple ``Index`` containing ``datetime.datetime`` objects is returned: + are **not successfully converted** to a :class:`DatetimeIndex`. 
Instead a + simple :class:`Index` containing :class:`datetime.datetime` objects is + returned: >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], dtype='object') - A mix of timezone-aware and timezone-naive inputs is converted to - a timezone-aware ``DatetimeIndex`` if the offsets of the timezone-aware + a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware are constant: >>> from datetime import datetime @@ -966,7 +972,7 @@ def to_datetime( DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) - - Finally, mixing timezone-aware strings and ``datetime.datetime`` always + - Finally, mixing timezone-aware strings and :class:`datetime.datetime` always raises an error, even if the elements all have the same time offset. >>> from datetime import datetime, timezone, timedelta From a2fb1a16d88dbedf09aefec182c5992d436f3a4d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 14:46:23 +0100 Subject: [PATCH 22/27] Changed as per code review: added const role --- pandas/core/tools/datetimes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1a48dfc669e2c..e451ab72bd40f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1116,9 +1116,9 @@ def _assemble_from_unit_mappings(arg, errors, tz): arg : DataFrame errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception - - If 'coerce', then invalid parsing will be set as NaT - - If 'ignore', then invalid parsing will return the input + - If :const:`'raise'`, then invalid parsing will raise an exception + - If :const:`'coerce'`, then invalid parsing will be set as NaT + - If :const:`'ignore'`, then invalid parsing will return the input tz : None or 'utc' 
Returns From 04312bd354a22b85679f8ba5108540548aeb2a83 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 14:49:08 +0100 Subject: [PATCH 23/27] sphinx role --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e451ab72bd40f..fb5c8da9e9fce 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1117,7 +1117,7 @@ def _assemble_from_unit_mappings(arg, errors, tz): errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If :const:`'raise'`, then invalid parsing will raise an exception - - If :const:`'coerce'`, then invalid parsing will be set as NaT + - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT` - If :const:`'ignore'`, then invalid parsing will return the input tz : None or 'utc' From 514b0c4097292e6821f1c9742767f94d9f246fa3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 15:03:51 +0100 Subject: [PATCH 24/27] Changed as per code review: sphinx roles --- pandas/core/tools/datetimes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index fb5c8da9e9fce..aa196bc3134ea 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -700,9 +700,9 @@ def to_datetime( The object to convert to a datetime. If a :class:`DataFrame` is provided, the method expects minimally the following columns: "year", "month", "day". errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception. - - If 'coerce', then invalid parsing will be set as :const:`NaT`. - - If 'ignore', then invalid parsing will return the input. + - If :const:`'raise'`, then invalid parsing will raise an exception. + - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. 
+ - If :const:`'ignore'`, then invalid parsing will return the input. dayfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. If :const:`True`, parses dates with the day first, eg 10/11/12 is parsed From fc2395da68af3ad5765f8308a9e00ebb6afff3d2 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 15:39:24 +0100 Subject: [PATCH 25/27] minor change again --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index aa196bc3134ea..9a15069c028c7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -705,7 +705,7 @@ def to_datetime( - If :const:`'ignore'`, then invalid parsing will return the input. dayfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. - If :const:`True`, parses dates with the day first, eg 10/11/12 is parsed + If :const:`True`, parses dates with the day first, e.g. 10/11/12 is parsed as 2012-11-10. .. warning:: From e0cf3297556228744d8d1100c9db3c349d60355e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 17:15:13 +0100 Subject: [PATCH 26/27] Last polishing round: sphinx roles and a few fixes --- pandas/core/tools/datetimes.py | 65 ++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9a15069c028c7..286e52080f1c9 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -698,15 +698,16 @@ def to_datetime( ---------- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like The object to convert to a datetime. If a :class:`DataFrame` is provided, the - method expects minimally the following columns: "year", "month", "day". + method expects minimally the following columns: :const:`"year"`, + :const:`"month"`, :const:`"day"`. 
errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If :const:`'raise'`, then invalid parsing will raise an exception. - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. - If :const:`'ignore'`, then invalid parsing will return the input. dayfirst : bool, default False - Specify a date parse order if `arg` is str or its list-likes. - If :const:`True`, parses dates with the day first, e.g. 10/11/12 is parsed - as 2012-11-10. + Specify a date parse order if `arg` is str or its list-like. + If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` + is parsed as :const:`2012-11-10`. .. warning:: @@ -716,12 +717,12 @@ def to_datetime( ``to_datetime(['31-12-2021'])``, then a warning will be shown. yearfirst : bool, default False - Specify a date parse order if `arg` is str or its list-likes. + Specify a date parse order if `arg` is str or is list-like. - - If :const:`True` parses dates with the year first, e.g. 10/11/12 is - parsed as 2010-11-12. - - If both dayfirst and yearfirst are :const:`True`, yearfirst is preceded - (same as dateutil). + - If :const:`True` parses dates with the year first, e.g. + :const:`"10/11/12"` is parsed as :const:`2010-11-12`. + - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` takes + precedence (same as :mod:`dateutil`). .. warning:: @@ -748,14 +749,16 @@ def to_datetime( #time-zone-handling>`_. format : str, default None - The strftime to parse time, e.g. ``"%d/%m/%Y"``. Note that "%f" will parse - all the way up to nanoseconds. See `strftime documentation + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. Note that + :const:`"%f"` will parse all the way up to nanoseconds. See + `strftime documentation `_ for more information on choices. exact : bool, default True - Behaves as: - - If :const:`True`, require an exact format match. 
- - If :const:`False`, allow the format to match anywhere in the target + Control how `format` is used: + + - If :const:`True`, require an exact `format` match. + - If :const:`False`, allow the `format` to match anywhere in the target string. unit : str, default 'ns' @@ -764,18 +767,18 @@ def to_datetime( Example, with ``unit='ms'`` and ``origin='unix'`` (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : bool, default False - If :const:`True` and no `format` is given, attempt to infer the format of - the datetime strings based on the first non-NaN element, + If :const:`True` and no `format` is given, attempt to infer the format + of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - - If ``'unix'`` (or POSIX) time; origin is set to 1970-01-01. - - If ``'julian'``, unit must be ``'D'``, and origin is set to beginning - of Julian Calendar. Julian day number 0 is assigned to the day starting - at noon on January 1, 4713 BC. + - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. + - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to + beginning of Julian Calendar. Julian day number :const:`0` is assigned + to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. cache : bool, default True @@ -787,7 +790,7 @@ def to_datetime( parsing. .. versionchanged:: 0.25.0 - - changed default value from :const:`False` to :const:`True`. + changed default value from :const:`False` to :const:`True`. 
Returns ------- @@ -800,10 +803,10 @@ def to_datetime( - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) - array-like: :class:`DatetimeIndex` (or :class:`Series` with :class:`object` dtype containing :class:`datetime.datetime`) - - :class:`Series`: :class:`Series` of :class:`datetime64` dtype (or + - Series: :class:`Series` of :class:`datetime64` dtype (or :class:`Series` of :class:`object` dtype containing :class:`datetime.datetime`) - - :class:`DataFrame`: :class:`Series` of :class:`datetime64` dtype (or + - DataFrame: :class:`Series` of :class:`datetime64` dtype (or :class:`Series` of :class:`object` dtype containing :class:`datetime.datetime`) @@ -813,9 +816,9 @@ def to_datetime( When parsing a date from string fails. ValueError When another datetime conversion error happens. For example when one - of 'year', 'month', day' is missing in a :class:`DataFrame`, or when - a Timezone-aware :class:`datetime.datetime` is found in an array-like of - mixed time offsets, and ``utc=False``. + of 'year', 'month', 'day' columns is missing in a :class:`DataFrame`, or + when a Timezone-aware :class:`datetime.datetime` is found in an array-like + of mixed time offsets, and ``utc=False``. See Also -------- @@ -828,23 +831,23 @@ def to_datetime( Many input types are supported, and lead to different output types: - - scalars can be int, float, str, datetime object (from stdlib :mod:`datetime` + - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime` module or :mod:`numpy`). They are converted to :class:`Timestamp` when possible, otherwise they are converted to :class:`datetime.datetime`. None/NaN/null scalars are converted to :const:`NaT`. - - array-like can contain int, float, str, datetime objects. They are + - **array-like** can contain int, float, str, datetime objects. 
They are converted to :class:`DatetimeIndex` when possible, otherwise they are converted to :class:`Index` with :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null entries are converted to :const:`NaT` in both cases. - - :class:`Series` are converted to :class:`Series` with :class:`datetime64` + - **Series** are converted to :class:`Series` with :class:`datetime64` dtype when possible, otherwise they are converted to :class:`Series` with :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null entries are converted to :const:`NaT` in both cases. - - :class:`DataFrame`/dict-like are converted to :class:`Series` with + - **DataFrame/dict-like** are converted to :class:`Series` with :class:`datetime64` dtype. For each row a datetime is created from assembling the various dataframe columns. Column keys can be common abbreviations like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or @@ -923,7 +926,7 @@ def to_datetime( If a date does not meet the `timestamp limitations `_, passing errors='ignore' + #timeseries-timestamp-limits>`_, passing ``errors='ignore'`` will return the original input instead of raising any exception. Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, From 1421830c8fd6648f7877400ead0820b44b675d63 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 4 Jan 2022 18:08:59 +0100 Subject: [PATCH 27/27] Fixed typo --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 286e52080f1c9..4d9420fc0510d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -705,7 +705,7 @@ def to_datetime( - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. - If :const:`'ignore'`, then invalid parsing will return the input. dayfirst : bool, default False - Specify a date parse order if `arg` is str or its list-like. 
+ Specify a date parse order if `arg` is str or is list-like. If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` is parsed as :const:`2012-11-10`.