diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f40f227259998..4d9420fc0510d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -691,100 +691,134 @@ def to_datetime( """ Convert argument to datetime. + This function converts a scalar, array-like, :class:`Series` or + :class:`DataFrame`/dict-like to a pandas datetime object. + Parameters ---------- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like - The object to convert to a datetime. If the DataFrame is provided, the method - expects minimally the following columns: "year", "month", "day". + The object to convert to a datetime. If a :class:`DataFrame` is provided, the + method expects minimally the following columns: :const:`"year"`, + :const:`"month"`, :const:`"day"`. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception. - - If 'coerce', then invalid parsing will be set as NaT. - - If 'ignore', then invalid parsing will return the input. + - If :const:`'raise'`, then invalid parsing will raise an exception. + - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. + - If :const:`'ignore'`, then invalid parsing will return the input. dayfirst : bool, default False - Specify a date parse order if `arg` is str or its list-likes. - If True, parses dates with the day first, eg 10/11/12 is parsed as - 2012-11-10. + Specify a date parse order if `arg` is str or is list-like. + If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` + is parsed as :const:`2012-11-10`. .. warning:: - dayfirst=True is not strict, but will prefer to parse + ``dayfirst=True`` is not strict, but will prefer to parse with day first. If a delimited date string cannot be parsed in accordance with the given `dayfirst` option, e.g. ``to_datetime(['31-12-2021'])``, then a warning will be shown. yearfirst : bool, default False - Specify a date parse order if `arg` is str or its list-likes. + Specify a date parse order if `arg` is str or is list-like. - - If True parses dates with the year first, eg 10/11/12 is parsed as - 2010-11-12. - - If both dayfirst and yearfirst are True, yearfirst is preceded (same - as dateutil). + - If :const:`True` parses dates with the year first, e.g. + :const:`"10/11/12"` is parsed as :const:`2010-11-12`. + - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is + preceded (same as :mod:`dateutil`). .. warning:: - yearfirst=True is not strict, but will prefer to parse + ``yearfirst=True`` is not strict, but will prefer to parse with year first. utc : bool, default None - Return UTC DatetimeIndex if True (converting any tz-aware - datetime.datetime objects as well). + Control timezone-related parsing, localization and conversion. + + - If :const:`True`, the function *always* returns a timezone-aware + UTC-localized :class:`Timestamp`, :class:`Series` or + :class:`DatetimeIndex`. To do this, timezone-naive inputs are + *localized* as UTC, while timezone-aware inputs are *converted* to UTC. + + - If :const:`False` (default), inputs will not be coerced to UTC. + Timezone-naive inputs will remain naive, while timezone-aware ones + will keep their time offsets. Limitations exist for mixed + offsets (typically, daylight savings), see :ref:`Examples + ` section for details. + + See also: pandas general documentation about `timezone conversion and + localization + `_. + format : str, default None - The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse - all the way up to nanoseconds. - See strftime documentation for more information on choices: - https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. - exact : bool, True by default - Behaves as: - - If True, require an exact format match. - - If False, allow the format to match anywhere in the target string. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. Note that + :const:`"%f"` will parse all the way up to nanoseconds. See + `strftime documentation + `_ for more information on choices. + exact : bool, default True + Control how `format` is used: + + - If :const:`True`, require an exact `format` match. + - If :const:`False`, allow the `format` to match anywhere in the target + string. unit : str, default 'ns' The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. - Example, with unit='ms' and origin='unix' (the default), this + Example, with ``unit='ms'`` and ``origin='unix'`` (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : bool, default False - If True and no `format` is given, attempt to infer the format of the - datetime strings based on the first non-NaN element, + If :const:`True` and no `format` is given, attempt to infer the format + of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - - If 'julian', unit must be 'D', and origin is set to beginning of - Julian Calendar. Julian day number 0 is assigned to the day starting - at noon on January 1, 4713 BC. + - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. + - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to + beginning of Julian Calendar. Julian day number :const:`0` is assigned + to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. cache : bool, default True - If True, use a cache of unique, converted dates to apply the datetime - conversion. May produce significant speed-up when parsing duplicate - date strings, especially ones with timezone offsets. The cache is only - used when there are at least 50 values. The presence of out-of-bounds - values will render the cache unusable and may slow down parsing. + If :const:`True`, use a cache of unique, converted dates to apply the + datetime conversion. May produce significant speed-up when parsing + duplicate date strings, especially ones with timezone offsets. The cache + is only used when there are at least 50 values. The presence of + out-of-bounds values will render the cache unusable and may slow down + parsing. .. versionchanged:: 0.25.0 - - changed default value from False to True. + changed default value from :const:`False` to :const:`True`. Returns ------- datetime If parsing succeeded. - Return type depends on input: - - - list-like: - - DatetimeIndex, if timezone naive or aware with the same timezone - - Index of object dtype, if timezone aware with mixed time offsets - - Series: Series of datetime64 dtype - - DataFrame: Series of datetime64 dtype - - scalar: Timestamp - - In case when it is not possible to return designated types (e.g. when - any element of input is before Timestamp.min or after Timestamp.max) - return will have datetime.datetime type (or corresponding - array/Series). + Return type depends on input (types in parenthesis correspond to + fallback in case of unsuccessful timezone or out-of-range timestamp + parsing): + + - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) + - array-like: :class:`DatetimeIndex` (or :class:`Series` with + :class:`object` dtype containing :class:`datetime.datetime`) + - Series: :class:`Series` of :class:`datetime64` dtype (or + :class:`Series` of :class:`object` dtype containing + :class:`datetime.datetime`) + - DataFrame: :class:`Series` of :class:`datetime64` dtype (or + :class:`Series` of :class:`object` dtype containing + :class:`datetime.datetime`) + + Raises + ------ + ParserError + When parsing a date from string fails. + ValueError + When another datetime conversion error happens. For example when one + of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or + when a Timezone-aware :class:`datetime.datetime` is found in an array-like + of mixed time offsets, and ``utc=False``. See Also -------- @@ -792,10 +826,57 @@ def to_datetime( to_timedelta : Convert argument to timedelta. convert_dtypes : Convert dtypes. + Notes + ----- + + Many input types are supported, and lead to different output types: + + - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime` + module or :mod:`numpy`). They are converted to :class:`Timestamp` when + possible, otherwise they are converted to :class:`datetime.datetime`. + None/NaN/null scalars are converted to :const:`NaT`. + + - **array-like** can contain int, float, str, datetime objects. They are + converted to :class:`DatetimeIndex` when possible, otherwise they are + converted to :class:`Index` with :class:`object` dtype, containing + :class:`datetime.datetime`. None/NaN/null entries are converted to + :const:`NaT` in both cases. + + - **Series** are converted to :class:`Series` with :class:`datetime64` + dtype when possible, otherwise they are converted to :class:`Series` with + :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null + entries are converted to :const:`NaT` in both cases. + + - **DataFrame/dict-like** are converted to :class:`Series` with + :class:`datetime64` dtype. For each row a datetime is created from assembling + the various dataframe columns. Column keys can be common abbreviations + like [‘year’, ‘month’, ‘day’, ‘minute’, ‘second’, ‘ms’, ‘us’, ‘ns’]) or + plurals of the same. + + The following causes are responsible for :class:`datetime.datetime` objects + being returned (possibly inside an :class:`Index` or a :class:`Series` with + :class:`object` dtype) instead of a proper pandas designated type + (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series` + with :class:`datetime64` dtype): + + - when any input element is before :const:`Timestamp.min` or after + :const:`Timestamp.max`, see `timestamp limitations + `_. + + - when ``utc=False`` (default) and the input is an array-like or + :class:`Series` containing mixed naive/aware datetime, or aware with mixed + time offsets. Note that this happens in the (quite frequent) situation when + the timezone has a daylight savings policy. In that case you may wish to + use ``utc=True``. + Examples -------- - Assembling a datetime from multiple columns of a DataFrame. The keys can be - common abbreviations like ['year', 'month', 'day', 'minute', 'second', + + **Handling various input formats** + + Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys + can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], @@ -806,20 +887,7 @@ def to_datetime( 1 2016-03-05 dtype: datetime64[ns] - If a date does not meet the `timestamp limitations - `_, passing errors='ignore' - will return the original input instead of raising any exception. - - Passing errors='coerce' will force an out-of-bounds date to NaT, - in addition to forcing non-dates (or non-parseable dates) to NaT. - - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') - datetime.datetime(1300, 1, 1, 0, 0) - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') - NaT - - Passing infer_datetime_format=True can often-times speedup a parsing + Passing ``infer_datetime_format=True`` can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) @@ -854,16 +922,99 @@ def to_datetime( DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) - In case input is list-like and the elements of input are of mixed - timezones, return will have object type Index if utc=False. + **Non-convertible date/times** + + If a date does not meet the `timestamp limitations + `_, passing ``errors='ignore'`` + will return the original input instead of raising any exception. + + Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, + in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. + + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') + datetime.datetime(1300, 1, 1, 0, 0) + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') + NaT + + .. _to_datetime_tz_examples: + + **Timezones and time offsets** + + The default behaviour (``utc=False``) is as follows: + + - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], + dtype='datetime64[ns]', freq=None) + + - Timezone-aware inputs *with constant time offset* are converted to + timezone-aware :class:`DatetimeIndex`: + + >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) + DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], + dtype='datetime64[ns, pytz.FixedOffset(-300)]', freq=None) + + - However, timezone-aware inputs *with mixed time offsets* (for example + issued from a timezone with daylight savings, such as Europe/Paris) + are **not successfully converted** to a :class:`DatetimeIndex`. Instead a + simple :class:`Index` containing :class:`datetime.datetime` objects is + returned: - >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500']) - Index([2018-10-26 12:00:00-05:30, 2018-10-26 12:00:00-05:00], dtype='object') + >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100']) + Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], + dtype='object') + + - A mix of timezone-aware and timezone-naive inputs is converted to + a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware + are constant: + + >>> from datetime import datetime + >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) + DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], + dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) + + - Finally, mixing timezone-aware strings and :class:`datetime.datetime` always + raises an error, even if the elements all have the same time offset. + + >>> from datetime import datetime, timezone, timedelta + >>> d = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) + >>> pd.to_datetime(["2020-01-01 17:00 -0100", d]) + Traceback (most recent call last): + ... + ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 + unless utc=True + + | + + Setting ``utc=True`` solves most of the above issues: + + - Timezone-naive inputs are *localized* as UTC + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + + - Timezone-aware inputs are *converted* to UTC (the output represents the + exact same datetime, but viewed from the UTC time offset `+00:00`). >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], ... utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) + + - Inputs can contain both naive and aware, string or datetime, the above + rules still apply + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530', + ... datetime(2020, 1, 1, 18), + ... datetime(2020, 1, 1, 18, + ... tzinfo=timezone(-timedelta(hours=1)))], + ... utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00', + '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) """ if arg is None: return None @@ -968,9 +1119,9 @@ def _assemble_from_unit_mappings(arg, errors, tz): arg : DataFrame errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception - - If 'coerce', then invalid parsing will be set as NaT - - If 'ignore', then invalid parsing will return the input + - If :const:`'raise'`, then invalid parsing will raise an exception + - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT` + - If :const:`'ignore'`, then invalid parsing will return the input tz : None or 'utc' Returns