From b9328bc3bc9459d28fc702e80a6f0637de147e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 17 Feb 2024 17:03:33 -0500 Subject: [PATCH 1/5] TYP: simplify read_csv/fwf/table overloads --- pandas/_typing.py | 5 +- pandas/io/parsers/readers.py | 496 +++++++---------------------------- 2 files changed, 91 insertions(+), 410 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 85a69f38b3f67..7a6ffaa1a9c99 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -106,9 +106,9 @@ P = ParamSpec("P") if sys.version_info >= (3, 11): - from typing import Self # pyright: ignore[reportUnusedImport] + from typing import Self, Unpack # pyright: ignore[reportUnusedImport] else: - from typing_extensions import Self # pyright: ignore[reportUnusedImport] + from typing_extensions import Self, Unpack # pyright: ignore[reportUnusedImport] else: npt: Any = None @@ -116,6 +116,7 @@ Self: Any = None TypeGuard: Any = None Concatenate: Any = None + Unpack: Any = None HashableT = TypeVar("HashableT", bound=Hashable) HashableT2 = TypeVar("HashableT2", bound=Hashable) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 018f597526acd..6661f917c8868 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -78,6 +78,7 @@ from types import TracebackType from pandas._typing import ( + Unpack, CompressionOptions, CSVEngine, DtypeArg, @@ -478,6 +479,59 @@ class _Fwf_Defaults(TypedDict): widths: None +class _read_shared(TypedDict, total=False): + # annotations shared between read_csv/fwf/table's overloads + # NOTE: Keep in sync with the annotations of the implementation + sep: str | None | lib.NoDefault + delimiter: str | None | lib.NoDefault + header: int | Sequence[int] | None | Literal["infer"] + names: Sequence[Hashable] | None | lib.NoDefault + index_col: IndexLabel | Literal[False] | None + usecols: UsecolsArgType + dtype: DtypeArg | None + engine: CSVEngine | None + converters: Mapping[Hashable, Callable] | None + true_values: list | None + false_values: list | None + skipinitialspace: bool + skiprows: list[int] | int | Callable[[Hashable], bool] | None + skipfooter: int + nrows: int | None + na_values: Hashable | Iterable[Hashable] | Mapping[ + Hashable, Iterable[Hashable] + ] | None + keep_default_na: bool + na_filter: bool + verbose: bool | lib.NoDefault + skip_blank_lines: bool + parse_dates: bool | Sequence[Hashable] | None + infer_datetime_format: bool | lib.NoDefault + keep_date_col: bool | lib.NoDefault + date_parser: Callable | lib.NoDefault + date_format: str | dict[Hashable, str] | None + dayfirst: bool + cache_dates: bool + compression: CompressionOptions + thousands: str | None + decimal: str + lineterminator: str | None + quotechar: str + quoting: int + doublequote: bool + escapechar: str | None + comment: str | None + encoding: str | None + encoding_errors: str | None + dialect: str | csv.Dialect | None + on_bad_lines: str + delim_whitespace: bool | lib.NoDefault + low_memory: bool + memory_map: bool + float_precision: Literal["high", "legacy", "round_trip"] | None + storage_options: StorageOptions | None + dtype_backend: DtypeBackend | lib.NoDefault + + _fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} @@ -624,241 +678,46 @@ def _read( return parser.read(nrows) -# iterator=True -> TextFileReader @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[True], chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy", "round_trip"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> TextFileReader: ... -# chunksize=int -> TextFileReader @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int, - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy", "round_trip"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> TextFileReader: ... -# default case -> DataFrame @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[False] = ..., chunksize: None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy", "round_trip"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> DataFrame: ... -# Unions -> DataFrame | TextFileReader @overload def read_csv( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Hashable - | Iterable[Hashable] - | Mapping[Hashable, Iterable[Hashable]] - | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool | lib.NoDefault = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: Literal["high", "legacy", "round_trip"] | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> DataFrame | TextFileReader: ... @@ -1024,230 +883,46 @@ def read_csv( return _read(filepath_or_buffer, kwds) -# iterator=True -> TextFileReader @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[True], chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> TextFileReader: ... -# chunksize=int -> TextFileReader @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int, - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> TextFileReader: ... -# default -> DataFrame @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: Literal[False] = ..., chunksize: None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> DataFrame: ... -# Unions -> DataFrame | TextFileReader @overload def read_table( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, - sep: str | None | lib.NoDefault = ..., - delimiter: str | None | lib.NoDefault = ..., - header: int | Sequence[int] | None | Literal["infer"] = ..., - names: Sequence[Hashable] | None | lib.NoDefault = ..., - index_col: IndexLabel | Literal[False] | None = ..., - usecols: UsecolsArgType = ..., - dtype: DtypeArg | None = ..., - engine: CSVEngine | None = ..., - converters: Mapping[Hashable, Callable] | None = ..., - true_values: list | None = ..., - false_values: list | None = ..., - skipinitialspace: bool = ..., - skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., - skipfooter: int = ..., - nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., - na_filter: bool = ..., - verbose: bool | lib.NoDefault = ..., - skip_blank_lines: bool = ..., - parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool | lib.NoDefault = ..., - date_parser: Callable | lib.NoDefault = ..., - date_format: str | dict[Hashable, str] | None = ..., - dayfirst: bool = ..., - cache_dates: bool = ..., iterator: bool = ..., chunksize: int | None = ..., - compression: CompressionOptions = ..., - thousands: str | None = ..., - decimal: str = ..., - lineterminator: str | None = ..., - quotechar: str = ..., - quoting: int = ..., - doublequote: bool = ..., - escapechar: str | None = ..., - comment: str | None = ..., - encoding: str | None = ..., - encoding_errors: str | None = ..., - dialect: str | csv.Dialect | None = ..., - on_bad_lines=..., - delim_whitespace: bool = ..., - low_memory: bool = ..., - memory_map: bool = ..., - float_precision: str | None = ..., - storage_options: StorageOptions = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., + **kwds: Unpack[_read_shared], ) -> DataFrame | TextFileReader: ... @@ -1287,13 +962,16 @@ def read_table( skipfooter: int = 0, nrows: int | None = None, # NA and Missing Data Handling - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = None, keep_default_na: bool = True, na_filter: bool = True, verbose: bool | lib.NoDefault = lib.no_default, skip_blank_lines: bool = True, # Datetime Handling - parse_dates: bool | Sequence[Hashable] = False, + parse_dates: bool | Sequence[Hashable] | None = None, infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool | lib.NoDefault = lib.no_default, date_parser: Callable | lib.NoDefault = lib.no_default, @@ -1322,7 +1000,7 @@ def read_table( delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, - float_precision: str | None = None, + float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: @@ -1410,10 +1088,9 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = ..., widths: Sequence[int] | None = ..., infer_nrows: int = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., iterator: Literal[True], chunksize: int | None = ..., - **kwds, + **kwds: Unpack[_read_shared], ) -> TextFileReader: ... @@ -1425,10 +1102,9 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = ..., widths: Sequence[int] | None = ..., infer_nrows: int = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., iterator: bool = ..., chunksize: int, - **kwds, + **kwds: Unpack[_read_shared], ) -> TextFileReader: ... @@ -1440,10 +1116,9 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = ..., widths: Sequence[int] | None = ..., infer_nrows: int = ..., - dtype_backend: DtypeBackend | lib.NoDefault = ..., iterator: Literal[False] = ..., chunksize: None = ..., - **kwds, + **kwds: Unpack[_read_shared], ) -> DataFrame: ... @@ -1454,10 +1129,9 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, infer_nrows: int = 100, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, iterator: bool = False, chunksize: int | None = None, - **kwds, + **kwds: Unpack[_read_shared], ) -> DataFrame | TextFileReader: r""" Read a table of fixed-width formatted lines into DataFrame. @@ -1536,7 +1210,7 @@ def read_fwf( # GH#40830 # Ensure length of `colspecs` matches length of `names` names = kwds.get("names") - if names is not None: + if names is not None and names is not lib.no_default: if len(names) != len(colspecs) and colspecs != "infer": # need to check len(index_col) as it might contain # unnamed indices, in which case it's name is not required @@ -1547,20 +1221,26 @@ def read_fwf( if not is_list_like(index_col): len_index = 1 else: + # for mypy: handled in the if-branch + assert index_col is not lib.no_default + len_index = len(index_col) if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): # If usecols is used colspec may be longer than names raise ValueError("Length of colspecs must match length of names") - kwds["colspecs"] = colspecs - kwds["infer_nrows"] = infer_nrows - kwds["engine"] = "python-fwf" - kwds["iterator"] = iterator - kwds["chunksize"] = chunksize - - check_dtype_backend(dtype_backend) - kwds["dtype_backend"] = dtype_backend - return _read(filepath_or_buffer, kwds) + check_dtype_backend(kwds.setdefault("dtype_backend", lib.no_default)) + return _read( + filepath_or_buffer, + kwds + | { + "colspecs": colspecs, + "infer_nrows": infer_nrows, + "engine": "python-fwf", + "iterator": iterator, + "chunksize": chunksize, + }, + ) class TextFileReader(abc.Iterator): From 76cf1b466721ce4efce0767e5c9c376b2e571b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 17 Feb 2024 17:15:11 -0500 Subject: [PATCH 2/5] isort (keep thinking ruff is configured to do that too) --- pandas/_typing.py | 10 ++++++++-- pandas/io/parsers/readers.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 7a6ffaa1a9c99..c811594896ec1 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -106,9 +106,15 @@ P = ParamSpec("P") if sys.version_info >= (3, 11): - from typing import Self, Unpack # pyright: ignore[reportUnusedImport] + from typing import ( # pyright: ignore[reportUnusedImport] + Self, + Unpack, + ) else: - from typing_extensions import Self, Unpack # pyright: ignore[reportUnusedImport] + from typing_extensions import ( # pyright: ignore[reportUnusedImport] + Self, + Unpack, + ) else: npt: Any = None diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6661f917c8868..f381dca5c2f1e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -78,7 +78,6 @@ from types import TracebackType from pandas._typing import ( - Unpack, CompressionOptions, CSVEngine, DtypeArg, @@ -88,6 +87,7 @@ ReadCsvBuffer, Self, StorageOptions, + Unpack, UsecolsArgType, ) _doc_read_csv_and_table = ( From df5851d6bc90fa214398d55ba4953ad3ef0b31b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 17 Feb 2024 17:17:06 -0500 Subject: [PATCH 3/5] another one --- pandas/_typing.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index c811594896ec1..0bcf5284315d2 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -106,15 +106,11 @@ P = ParamSpec("P") if sys.version_info >= (3, 11): - from typing import ( # pyright: ignore[reportUnusedImport] - Self, - Unpack, - ) + from typing import Self # pyright: ignore[reportUnusedImport] + from typing import Unpack # pyright: ignore[reportUnusedImport] else: - from typing_extensions import ( # pyright: ignore[reportUnusedImport] - Self, - Unpack, - ) + from typing_extensions import Self # pyright: ignore[reportUnusedImport] + from typing_extensions import Unpack # pyright: ignore[reportUnusedImport] else: npt: Any = None From d406fbdd4f00e2dfbd5ee6ebd1ec9802bdf5dfe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 17 Feb 2024 19:40:20 -0500 Subject: [PATCH 4/5] fix docs? --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/readers.py | 20 +++++--------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 77c80dcfe7c7e..5fe7d1874b598 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -93,6 +93,7 @@ Other API changes - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) +- :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index f381dca5c2f1e..5f8694d0ed1d0 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -243,12 +243,13 @@ .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ -default False +parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : list}}, \ +default None The behavior is as follows: - * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to - ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. + * ``bool``. If ``True`` -> try parsing the index. + * ``None``. Behaves like ``True`` if ``date_parser`` or ``date_format`` are + specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse @@ -1162,17 +1163,6 @@ def read_fwf( infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). Behaviour is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` - (default). - * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` - DataFrame. - - .. versionadded:: 2.0 - **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. From 210324af7df1b6a5b97ed2a8237df1f925f62f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 17 Feb 2024 19:58:37 -0500 Subject: [PATCH 5/5] sort whatsnew entry --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/io/parsers/readers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5fe7d1874b598..b57a258d626ee 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -90,10 +90,10 @@ Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) - :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now returns a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`) +- :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) -- :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 5f8694d0ed1d0..2a42b4115d1a2 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -243,8 +243,8 @@ .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : list}}, \ -default None +parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ +list}}, default None The behavior is as follows: * ``bool``. If ``True`` -> try parsing the index.