From a9f733e399171fc2f89eb2eabdcebc820d362672 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 15:05:55 -0700 Subject: [PATCH 1/3] CLN: Remove deprecated read_*(date_parser=) --- doc/source/user_guide/io.rst | 13 - doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/excel/_base.py | 38 - pandas/io/parsers/base_parser.py | 91 +-- pandas/io/parsers/readers.py | 30 +- pandas/tests/io/excel/test_writers.py | 13 - pandas/tests/io/parser/test_parse_dates.py | 773 +-------------------- pandas/tests/io/parser/test_read_fwf.py | 13 - 8 files changed, 25 insertions(+), 947 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index bd14abdd9408c..30bbbbdc2926b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -279,19 +279,6 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default keep_date_col : boolean, default ``False`` If ``True`` and parse_dates specifies combining multiple columns then keep the original columns. -date_parser : function, default ``None`` - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call date_parser in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays (as - defined by parse_dates) as arguments; 2) concatenate (row-wise) the string - values from the columns defined by parse_dates into a single array and pass - that; and 3) call date_parser once for each row using one or more strings - (corresponding to the columns defined by parse_dates) as arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c932d793038c2..d3b4ca85d067c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -254,6 +254,7 @@ Removal of prior version deprecations/changes - Enforced deprecation of :meth:`offsets.Tick.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`) - Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`) +- Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`) - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`) - Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`) - Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6063ac098a4dc..dd06c597c1857 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -240,20 +240,6 @@ For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. -date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -398,7 +384,6 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | lib.NoDefault = ..., date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., @@ -436,7 +421,6 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | lib.NoDefault = ..., date_format: dict[Hashable, str] | str | None = ..., thousands: str | None = ..., decimal: str = ..., @@ -474,7 +458,6 @@ def read_excel( na_filter: bool = True, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -521,7 +504,6 @@ def read_excel( na_filter=na_filter, verbose=verbose, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -726,7 +708,6 @@ def parse( na_values=None, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -795,7 +776,6 @@ def parse( false_values=false_values, na_values=na_values, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -829,7 +809,6 @@ def _parse_sheet( false_values: Iterable[Hashable] | None = None, na_values=None, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: dict[Hashable, str] | str | None = None, thousands: str | None = None, decimal: str = ".", @@ -942,7 +921,6 @@ def _parse_sheet( na_values=na_values, skip_blank_lines=False, # GH 39808 parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, decimal=decimal, @@ -1648,7 +1626,6 @@ def parse( nrows: int | None = None, na_values=None, parse_dates: list | dict | bool = False, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, thousands: str | None = None, comment: str | None = None, @@ -1737,20 +1714,6 @@ def parse( ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. - date_parser : function, optional - Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`to_datetime` as-needed. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -1810,7 +1773,6 @@ def parse( nrows=nrows, na_values=na_values, parse_dates=parse_dates, - date_parser=date_parser, date_format=date_format, thousands=thousands, comment=comment, diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 510097aed2a25..ae02d7214d1ee 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -3,7 +3,6 @@ from collections import defaultdict from copy import copy import csv -import datetime from enum import Enum import itertools from typing import ( @@ -127,7 +126,6 @@ def __init__(self, kwds) -> None: self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) self._parse_date_cols: Iterable = [] - self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) self.keep_date_col = kwds.pop("keep_date_col", False) @@ -146,7 +144,6 @@ def __init__(self, kwds) -> None: self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( - date_parser=self.date_parser, date_format=self.date_format, dayfirst=self.dayfirst, cache_dates=self.cache_dates, @@ -1120,84 +1117,33 @@ def _get_empty_meta( def _make_date_converter( - date_parser=lib.no_default, dayfirst: bool = False, cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - if date_parser is not lib.no_default: - warnings.warn( - "The argument 'date_parser' is deprecated and will " - "be removed in a future version. " - "Please use 'date_format' instead, or read your data in as 'object' dtype " - "and then call 'to_datetime'.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if date_parser is not lib.no_default and date_format is not None: - raise TypeError("Cannot use both 'date_parser' and 'date_format'") - - def unpack_if_single_element(arg): - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1: - return arg[0] - return arg - def converter(*date_cols, col: Hashable): if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": return date_cols[0] - if date_parser is lib.no_default: - strs = parsing.concat_date_cols(date_cols) - date_fmt = ( - date_format.get(col) if isinstance(date_format, dict) else date_format - ) + strs = parsing.concat_date_cols(date_cols) + date_fmt = ( + date_format.get(col) if isinstance(date_format, dict) else date_format + ) - str_objs = ensure_object(strs) - try: - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_usecols_with_parse_dates4 - return str_objs - - if isinstance(result, DatetimeIndex): - arr = result.to_numpy() - arr.flags.writeable = True - return arr - return result._values - else: - try: - pre_parsed = date_parser( - *(unpack_if_single_element(arg) for arg in date_cols) - ) - try: - result = tools.to_datetime( - pre_parsed, - cache=cache_dates, - ) - except (ValueError, TypeError): - # test_read_csv_with_custom_date_parser - result = pre_parsed - if isinstance(result, datetime.datetime): - raise Exception("scalar parser") - return result - except Exception: - # e.g. test_datetime_fractional_seconds - pre_parsed = parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), - parser=date_parser, - ) - try: - return tools.to_datetime(pre_parsed) - except (ValueError, TypeError): - # TODO: not reached in tests 2023-10-27; needed? - return pre_parsed + str_objs = ensure_object(strs) + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, + ) + + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values return converter @@ -1230,7 +1176,6 @@ def converter(*date_cols, col: Hashable): "parse_dates": False, "keep_date_col": False, "dayfirst": False, - "date_parser": lib.no_default, "date_format": None, "usecols": None, # 'iterator': False, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index b9235f7068630..648e5108df77a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -119,7 +119,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): skip_blank_lines: bool parse_dates: bool | Sequence[Hashable] | None keep_date_col: bool | lib.NoDefault - date_parser: Callable | lib.NoDefault date_format: str | dict[Hashable, str] | None dayfirst: bool cache_dates: bool @@ -306,8 +305,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): The behavior is as follows: * ``bool``. If ``True`` -> try parsing the index. - * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are - specified. + * ``None``. Behaves like ``True`` if ``date_format`` is specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse @@ -325,20 +323,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): keep_date_col : bool, default False If ``True`` and ``parse_dates`` specifies combining multiple columns then keep the original columns. -date_parser : Callable, optional - Function to use for converting a sequence of string columns to an array of - ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the - conversion. pandas will try to call ``date_parser`` in three different ways, - advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by ``parse_dates`` into a single array - and pass that; and 3) call ``date_parser`` once for each row using one or - more strings (corresponding to the columns defined by ``parse_dates``) as - arguments. - - .. deprecated:: 2.0.0 - Use ``date_format`` instead, or read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. date_format : str or dict of column -> format, optional Format to use for parsing dates when used in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See @@ -624,13 +608,10 @@ def _read( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds ) -> DataFrame | TextFileReader: """Generic reader of line files.""" - # if we pass a date_parser and parse_dates=False, we should not parse the + # if we pass a date_format and parse_dates=False, we should not parse the # dates GH#44366 if kwds.get("parse_dates", None) is None: - if ( - kwds.get("date_parser", lib.no_default) is lib.no_default - and kwds.get("date_format", None) is None - ): + if kwds.get("date_format", None) is None: kwds["parse_dates"] = False else: kwds["parse_dates"] = True @@ -749,7 +730,6 @@ def read_csv( # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -928,7 +908,6 @@ def read_table( # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, keep_date_col: bool | lib.NoDefault = lib.no_default, - date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -1638,9 +1617,6 @@ def TextParser(*args, **kwds) -> TextFileReader: Comment out remainder of line parse_dates : bool, default False keep_date_col : bool, default False - date_parser : function, optional - - .. deprecated:: 2.0.0 date_format : str or dict of column -> format, default ``None`` .. versionadded:: 2.0.0 diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 508fc47d0920b..859152db84b7d 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -295,19 +295,6 @@ def test_read_excel_parse_dates(self, tmp_excel): res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) - date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") - with tm.assert_produces_warning( - FutureWarning, - match="use 'date_format' instead", - raise_on_extra_warnings=False, - ): - res = pd.read_excel( - tmp_excel, - parse_dates=["date_strings"], - date_parser=date_parser, - index_col=0, - ) - tm.assert_frame_equal(df, res) res = pd.read_excel( tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 62e4f6d8c40b5..9c9a10f206a47 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -4,14 +4,12 @@ """ from datetime import ( - date, datetime, timedelta, timezone, ) from io import StringIO -from dateutil.parser import parse as du_parse import numpy as np import pytest import pytz @@ -41,81 +39,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow -def test_read_csv_with_custom_date_parser(all_parsers): - # GH36111 - def __custom_date_parser(time): - time = time.astype(np.float64) - time = time.astype(int) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e n h - 41047.00 -98573.7297 871458.0640 389.0089 - 41048.00 -98573.7299 871458.0640 389.0089 - 41049.00 -98573.7300 871458.0642 389.0088 - 41050.00 -98573.7299 871458.0643 389.0088 - 41051.00 -98573.7302 871458.0640 389.0086 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=True, - date_parser=__custom_date_parser, - index_col="time", - ) - time = [41047, 41048, 41049, 41050, 41051] - time = pd.TimedeltaIndex([pd.to_timedelta(i, unit="s") for i in time], name="time") - expected = DataFrame( - { - "e": [-98573.7297, -98573.7299, -98573.7300, -98573.7299, -98573.7302], - "n": [871458.0640, 871458.0640, 871458.0642, 871458.0643, 871458.0640], - "h": [389.0089, 389.0089, 389.0088, 389.0088, 389.0086], - }, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): - # GH44366 - def __custom_date_parser(time): - time = time.astype(np.float64) - time = time.astype(int) # convert float seconds to int type - return pd.to_timedelta(time, unit="s") - - testdata = StringIO( - """time e - 41047.00 -93.77 - 41048.00 -95.79 - 41049.00 -98.73 - 41050.00 -93.99 - 41051.00 -97.72 - """ - ) - result = all_parsers.read_csv_check_warnings( - FutureWarning, - "Please use 'date_format' instead", - testdata, - delim_whitespace=True, - parse_dates=False, - date_parser=__custom_date_parser, - index_col="time", - ) - time = Series([41047.00, 41048.00, 41049.00, 41050.00, 41051.00], name="time") - expected = DataFrame( - {"e": [-93.77, -95.79, -98.73, -93.99, -97.72]}, - index=time, - ) - - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 @@ -144,164 +67,6 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." - ) - request.applymarker(mark) - - def date_parser(*date_cols): - """ - Test date parser. - - Parameters - ---------- - date_cols : args - The list of data columns to parse. - - Returns - ------- - parsed : Series - """ - return parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), parser=du_parse - ) - - kwds = { - "header": None, - "date_parser": date_parser, - "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "actual", - "nominal", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -495,110 +260,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -def test_multiple_date_cols_int_cast(all_parsers): - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - parse_dates = {"actual": [1, 2], "nominal": [1, 3]} - parser = all_parsers - - kwds = { - "header": None, - "parse_dates": parse_dates, - "date_parser": pd.to_datetime, - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_multiple_date_col_timestamp_parse(all_parsers): - parser = all_parsers - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=[[0, 1]], - header=None, - date_parser=Timestamp, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 1, - "E", - 0, - np.nan, - 1306.25, - ], - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 8, - "E", - 0, - np.nan, - 1306.25, - ], - ], - columns=["0_1", 2, 3, 4, 5, 6, 7], - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers @@ -729,65 +390,6 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser.read_csv(StringIO(data), parse_dates=parse_dates) -def test_date_parser_int_bug(all_parsers): - # see gh-3071 - parser = all_parsers - data = ( - "posix_timestamp,elapsed,sys,user,queries,query_time,rows," - "accountid,userid,contactid,level,silo,method\n" - "1343103150,0.062353,0,4,6,0.01690,3," - "12345,1,-1,3,invoice_InvoiceResource,search\n" - ) - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - index_col=0, - parse_dates=[0], - # Note: we must pass tz and then drop the tz attribute - # (if we don't CI will flake out depending on the runner's local time) - date_parser=lambda x: datetime.fromtimestamp(int(x), tz=timezone.utc).replace( - tzinfo=None - ), - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - 0.062353, - 0, - 4, - 6, - 0.01690, - 3, - 12345, - 1, - -1, - 3, - "invoice_InvoiceResource", - "search", - ] - ], - columns=[ - "elapsed", - "sys", - "user", - "queries", - "query_time", - "rows", - "accountid", - "userid", - "contactid", - "level", - "silo", - "method", - ], - index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 @@ -807,26 +409,6 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@skip_pyarrow -def test_csv_custom_parser(all_parsers): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), - ) - expected = parser.read_csv(StringIO(data), parse_dates=True) - tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), date_format="%Y%m%d") - tm.assert_frame_equal(result, expected) - - @skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C @@ -959,53 +541,6 @@ def test_multi_index_parse_dates(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}]) -def test_parse_dates_custom_euro_format(all_parsers, kwargs): - parser = all_parsers - data = """foo,bar,baz -31/01/2010,1,2 -01/02/2010,1,NA -02/02/2010,1,2 -""" - if "dayfirst" in kwargs: - df = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - header=0, - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - exp_index = Index( - [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], - name="time", - ) - expected = DataFrame( - {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, - index=exp_index, - columns=["Q", "NTU"], - ) - tm.assert_frame_equal(df, expected) - else: - msg = "got an unexpected keyword argument 'day_first'" - with pytest.raises(TypeError, match=msg): - parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - skiprows=[0], - index_col=0, - parse_dates=True, - na_values=["NA"], - ) - - def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers @@ -1383,26 +918,6 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "reader", ["read_csv_check_warnings", "read_table_check_warnings"] -) -def test_parse_dates_date_parser_and_date_format(all_parsers, reader): - # GH 50601 - parser = all_parsers - data = "Date,test\n2012-01-01,1\n,2" - msg = "Cannot use both 'date_parser' and 'date_format'" - with pytest.raises(TypeError, match=msg): - getattr(parser, reader)( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=["Date"], - date_parser=pd.to_datetime, - date_format="ISO8601", - sep=",", - ) - - @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1444,279 +959,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_parse_date_time_multi_level_column_name(all_parsers): - data = """\ -D,T,A,B -date, time,a,b -2001-01-05, 09:00:00, 0.0, 10. -2001-01-06, 00:00:00, 1.0, 11. -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=pd.to_datetime, - ) - - expected_data = [ - [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], - ] - expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """\ -date,time,a,b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. -""", - {"header": 0, "parse_dates": {"date_time": [0, 1]}}, - DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], - ], - columns=["date_time", "a", "b"], - ), - ), - ( - ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ), - {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, - DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - 0.81, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - 0.01, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ), - ), - ], -) -def test_parse_date_time(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=pd.to_datetime, - **kwargs, - raise_on_extra_warnings=False, - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_parse_date_fields(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=lambda x: x, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], - columns=["ymd", "a"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S", None), - ], -) -def test_parse_date_all_fields(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0,0.0,10. -2001,01,5,10,0,00,1.,11. -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S.%f", None), - ], -) -def test_datetime_fractional_seconds(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0.123456,0.0,10. -2001,01,5,10,0,0.500000,1.,11. -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_generic(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - - def parse_function(yy, mm): - return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=parse_function, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) - expected["ym"] = expected["ym"].astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_date_parser_resolution_if_not_ns(all_parsers): - # see gh-10245 - parser = all_parsers - data = """\ -date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(dt, time): - try: - arr = dt + "T" + time - except TypeError: - # dt & time are date/time objects - arr = [datetime.combine(d, t) for d, t in zip(dt, time)] - return np.array(arr, dtype="datetime64[s]") - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"], - ) - - datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") - expected = DataFrame( - data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_arrays( - [datetimes, [126, 23, 13]], - names=["datetime", "prn"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -2128,14 +1370,7 @@ def test_infer_first_column_as_index(all_parsers): @xfail_pyarrow # pyarrow engine doesn't support passing a dict for na_values -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning), - ("date_format", "%Y-%m-%d", None), - ], -) -def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): +def test_replace_nans_before_parsing_dates(all_parsers): # GH#26203 parser = all_parsers data = """Test @@ -2145,13 +1380,11 @@ def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): # 2017-09-09 """ - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", + result = parser.read_csv( StringIO(data), na_values={"Test": ["#", "0"]}, parse_dates=["Test"], - **{key: value}, + date_format="%Y-%m-%d", ) expected = DataFrame( { diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index b62fcc04c375c..547afb1b25a04 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -4,7 +4,6 @@ engine is set to 'python-fwf' internally. """ -from datetime import datetime from io import ( BytesIO, StringIO, @@ -284,17 +283,6 @@ def test_fwf_regression(): 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ - - with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): - result = read_fwf( - StringIO(data), - index_col=0, - header=None, - names=names, - widths=widths, - parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), - ) expected = DataFrame( [ [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], @@ -314,7 +302,6 @@ def test_fwf_regression(): ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) - tm.assert_frame_equal(result, expected) result = read_fwf( StringIO(data), index_col=0, From 8e37a49761deb8a025583c93cfc75479c8d767ae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 15:08:39 -0700 Subject: [PATCH 2/3] Add comment --- pandas/io/parsers/base_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index ae02d7214d1ee..b3cf8fc6368ea 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1124,7 +1124,8 @@ def _make_date_converter( def converter(*date_cols, col: Hashable): if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": return date_cols[0] - + # TODO: Can we remove concat_date_cols after deprecation of parsing + # multiple cols? strs = parsing.concat_date_cols(date_cols) date_fmt = ( date_format.get(col) if isinstance(date_format, dict) else date_format From 124db86b4c577054a809a6add4a7e15416fa7736 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 15:37:09 -0700 Subject: [PATCH 3/3] Add back try except --- pandas/io/parsers/base_parser.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index b3cf8fc6368ea..c9ed653babd7a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1132,13 +1132,18 @@ def converter(*date_cols, col: Hashable): ) str_objs = ensure_object(strs) - result = tools.to_datetime( - str_objs, - format=date_fmt, - utc=False, - dayfirst=dayfirst, - cache=cache_dates, - ) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, + ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + # test_multi_index_parse_dates + return str_objs if isinstance(result, DatetimeIndex): arr = result.to_numpy()