From 9b351a188a8d193fb16cff49688083d28a0f714e Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 10 Nov 2021 23:19:56 +0100 Subject: [PATCH 01/14] Start typing parsers --- pandas/io/parsers/base_parser.py | 12 +++- pandas/io/parsers/python_parser.py | 99 ++++++++++++++++++------------ 2 files changed, 69 insertions(+), 42 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 8cdcc05f60266..096a2c8d997fd 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -27,6 +27,7 @@ ArrayLike, DtypeArg, FilePathOrBuffer, + Scalar, ) from pandas.errors import ( ParserError, @@ -231,7 +232,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None: errors=kwds.get("encoding_errors", "strict"), ) - def _validate_parse_dates_presence(self, columns: list[str]) -> None: + def _validate_parse_dates_presence(self, columns: list[Scalar]) -> None: """ Check if parse_dates are in columns. @@ -314,12 +315,17 @@ def _should_parse_dates(self, i: int) -> bool: @final def _extract_multi_indexer_columns( - self, header, index_names, col_names, passed_names: bool = False + self, + header: list[list[Scalar | None]], + index_names: list | None, + col_names: None, + passed_names: bool = False, ): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ + assert col_names is None if len(header) < 2: return header[0], index_names, col_names, passed_names @@ -610,7 +616,7 @@ def _convert_to_ndarrays( @final def _set_noconvert_dtype_columns( - self, col_indices: list[int], names: list[int | str | tuple] + self, col_indices: list[int], names: list[Scalar | tuple] ) -> set[int]: """ Set the columns that should not undergo dtype conversions. diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index af253fc062632..ce0dfc567ba43 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -19,12 +19,21 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + FilePathOrBuffer, + Scalar, +) from pandas.errors import ( EmptyDataError, ParserError, ) +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.inference import is_dict_like @@ -33,10 +42,6 @@ parser_defaults, ) -# BOM character (byte order mark) -# This exists at the beginning of a file to indicate endianness -# of a file (stream). Unfortunately, this marker screws up parsing, -# so we need to remove it if we see it. _BOM = "\ufeff" @@ -105,9 +110,10 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices: list[int] | None = None + columns: list[list[Scalar | None]] try: ( - self.columns, + columns, self.num_original_columns, self.unnamed_cols, ) = self._infer_columns() @@ -117,7 +123,8 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. 
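# A note on the new `header: list[list[Scalar | None]]` annotation for
# _extract_multi_indexer_columns above: with a multi-row header, the parser
# hands over one inner list per header row, and None marks unnamed header
# cells.  A minimal standalone sketch of that shape (illustration only, not
# parser internals):

header = [
    ["a", "a", None],  # header row 0; the third column is unnamed
    ["x", "y", "z"],   # header row 1
]

# _extract_multi_indexer_columns zips such rows into per-column tuples that
# later become the levels of a MultiIndex, roughly:
names = list(zip(*header))
print(names)  # [('a', 'x'), ('a', 'y'), (None, 'z')]
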
- if len(self.columns) > 1: + self.columns: list[Scalar | tuple] + if len(columns) > 1: # we are processing a multi index column # error: Cannot determine type of 'index_names' # error: Cannot determine type of 'col_names' @@ -127,17 +134,17 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): self.col_names, _, ) = self._extract_multi_indexer_columns( - self.columns, + columns, self.index_names, # type: ignore[has-type] self.col_names, # type: ignore[has-type] ) # Update list of original names to include all indices. self.num_original_columns = len(self.columns) else: - self.columns = self.columns[0] + self.columns = columns[0] # get popped off for index - self.orig_names: list[int | str | tuple] = list(self.columns) + self.orig_names: list[Scalar | tuple] = list(self.columns) # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory @@ -352,11 +359,13 @@ def _clean_mapping(mapping): clean_dtypes, ) - def _infer_columns(self): + def _infer_columns( + self, + ) -> tuple[list[list[Scalar | None]], int, set[Scalar | None]]: names = self.names num_original_columns = 0 clear_buffer = True - unnamed_cols: set[str | int | None] = set() + unnamed_cols: set[Scalar | None] = set() if self.header is not None: header = self.header @@ -370,7 +379,7 @@ def _infer_columns(self): have_mi_columns = False header = [header] - columns: list[list[int | str | None]] = [] + columns: list[list[Scalar | None]] = [] for level, hr in enumerate(header): try: line = self._buffered_line() @@ -399,7 +408,7 @@ def _infer_columns(self): line = self.names[:] - this_columns: list[int | str | None] = [] + this_columns: list[Scalar | None] = [] this_unnamed_cols = [] for i, c in enumerate(line): @@ -527,10 +536,10 @@ def _infer_columns(self): def _handle_usecols( self, - columns: list[list[str | int | None]], - usecols_key: list[str | int | None], + columns: list[list[Scalar | None]], + usecols_key: list[Scalar | None], num_original_columns: int, - ): + ) -> list[list[Scalar | None]]: """ Sets self._col_indices @@ -563,7 +572,7 @@ def _handle_usecols( "Defining usecols with out of bounds indices is deprecated " "and will raise a ParserError in a future version.", FutureWarning, - stacklevel=8, + stacklevel=find_stack_level(), ) col_indices = self.usecols @@ -583,7 +592,7 @@ def _buffered_line(self): else: return self._next_line() - def _check_for_bom(self, first_row): + def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]: """ Checks whether the file begins with the BOM character. If it does, remove it. In addition, if there is quoting @@ -614,6 +623,7 @@ def _check_for_bom(self, first_row): return first_row first_row_bom = first_row[0] + new_row: str if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: start = 2 @@ -632,9 +642,11 @@ def _check_for_bom(self, first_row): # No quotation so just remove BOM from first element new_row = first_row_bom[1:] - return [new_row] + first_row[1:] - def _is_line_empty(self, line): + new_row_list: list[Scalar] = [new_row] + return new_row_list + first_row[1:] + + def _is_line_empty(self, line: list[Scalar]) -> bool: """ Check if a line is empty or not. 
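The quote-aware BOM stripping in `_check_for_bom` above is easiest to see in
isolation. A self-contained sketch of the same logic (simplified: the real
method operates on the parser's buffered first row and its configured
`self.quotechar`):

    from __future__ import annotations

    _BOM = "\ufeff"

    def strip_bom(first_row: list[str], quotechar: str = '"') -> list[str]:
        # Only the first cell of the first row can carry the BOM.
        first = first_row[0]
        if not first.startswith(_BOM):
            return first_row
        if len(first) > 1 and first[1] == quotechar:
            # BOM sits before the opening quote: keep the data between the
            # quote pair, plus anything after the closing quote.
            end = first[2:].index(quotechar) + 2
            new_first = first[2:end]
            if len(first) > end + 1:
                new_first += first[end + 1 :]
        else:
            # No quoting, so just drop the BOM from the first element.
            new_first = first[1:]
        return [new_first] + first_row[1:]

    print(strip_bom(["\ufeffa", "b"]))    # ['a', 'b']
    print(strip_bom(['\ufeff"a"', "b"]))  # ['a', 'b']
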
@@ -649,7 +661,7 @@ def _is_line_empty(self, line): """ return not line or all(not x for x in line) - def _next_line(self): + def _next_line(self) -> list[Scalar]: if isinstance(self.data, list): while self.skipfunc(self.pos): self.pos += 1 @@ -703,7 +715,7 @@ def _next_line(self): self.buf.append(line) return line - def _alert_malformed(self, msg, row_num): + def _alert_malformed(self, msg: str, row_num: int) -> None: """ Alert a user about a malformed row, depending on value of `self.on_bad_lines` enum. @@ -713,10 +725,12 @@ def _alert_malformed(self, msg, row_num): Parameters ---------- - msg : The error message to display. - row_num : The row number where the parsing error occurred. - Because this row number is displayed, we 1-index, - even though we 0-index internally. + msg: str + The error message to display. + row_num: int + The row number where the parsing error occurred. + Because this row number is displayed, we 1-index, + even though we 0-index internally. """ if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) @@ -724,7 +738,7 @@ def _alert_malformed(self, msg, row_num): base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n") - def _next_iter_line(self, row_num): + def _next_iter_line(self, row_num: int) -> list[Scalar] | None: """ Wrapper around iterating through `self.data` (CSV source). @@ -734,12 +748,15 @@ def _next_iter_line(self, row_num): Parameters ---------- - row_num : The row number of the line being parsed. + row_num: int + The row number of the line being parsed. """ try: # assert for mypy, data is Iterator[str] or None, would error in next assert self.data is not None - return next(self.data) + line = next(self.data) + assert isinstance(line, list) + return line except csv.Error as e: if ( self.on_bad_lines == self.BadLineHandleMethod.ERROR @@ -768,7 +785,7 @@ def _next_iter_line(self, row_num): self._alert_malformed(msg, row_num) return None - def _check_comments(self, lines): + def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: if self.comment is None: return lines ret = [] @@ -789,19 +806,19 @@ def _check_comments(self, lines): ret.append(rl) return ret - def _remove_empty_lines(self, lines): + def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: """ Iterate through the lines and remove any that are either empty or contain only one whitespace value Parameters ---------- - lines : array-like + lines : list of list of Scalars The array of lines that we are to filter. Returns ------- - filtered_lines : array-like + filtered_lines : list of list of Scalars The same array of lines with the "empty" ones removed. 
""" ret = [] @@ -815,7 +832,7 @@ def _remove_empty_lines(self, lines): ret.append(line) return ret - def _check_thousands(self, lines): + def _check_thousands(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: if self.thousands is None: return lines @@ -823,7 +840,9 @@ def _check_thousands(self, lines): lines=lines, search=self.thousands, replace="" ) - def _search_replace_num_columns(self, lines, search, replace): + def _search_replace_num_columns( + self, lines: list[list[Scalar]], search: str, replace: str + ) -> list[list[Scalar]]: ret = [] for line in lines: rl = [] @@ -840,7 +859,7 @@ def _search_replace_num_columns(self, lines, search, replace): ret.append(rl) return ret - def _check_decimal(self, lines): + def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: if self.decimal == parser_defaults["decimal"]: return lines @@ -848,12 +867,12 @@ def _check_decimal(self, lines): lines=lines, search=self.decimal, replace="." ) - def _clear_buffer(self): + def _clear_buffer(self) -> None: self.buf = [] _implicit_index = False - def _get_index_name(self, columns): + def _get_index_name(self, columns: list[Scalar | tuple]): """ Try several cases to get lines: @@ -868,11 +887,13 @@ def _get_index_name(self, columns): orig_names = list(columns) columns = list(columns) + line: list[Scalar] | None try: line = self._next_line() except StopIteration: line = None + next_line: list[Scalar] | None try: next_line = self._next_line() except StopIteration: From a1a465edcaf459039e88369430c400312c9e6f02 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 11 Nov 2021 21:27:57 +0100 Subject: [PATCH 02/14] Remove parameter --- pandas/io/parsers/base_parser.py | 6 ++---- pandas/io/parsers/c_parser_wrapper.py | 1 - pandas/io/parsers/python_parser.py | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 096a2c8d997fd..5c1bca9cd81bb 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -316,18 +316,16 @@ def _should_parse_dates(self, i: int) -> bool: @final def _extract_multi_indexer_columns( self, - header: list[list[Scalar | None]], + header, index_names: list | None, - col_names: None, passed_names: bool = False, ): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ - assert col_names is None if len(header) < 2: - return header[0], index_names, col_names, passed_names + return header[0], index_names, None, passed_names # the names are the tuples of the header that are not the index cols # 0 is the name of the index, assuming index_col is a list of column diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 32ca3aaeba6cc..40a070326a1c0 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -91,7 +91,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds): ) = self._extract_multi_indexer_columns( self._reader.header, self.index_names, # type: ignore[has-type] - self.col_names, # type: ignore[has-type] passed_names, ) else: diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index ce0dfc567ba43..df55564f8a9e8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -136,7 +136,6 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): ) = self._extract_multi_indexer_columns( columns, self.index_names, # type: ignore[has-type] - self.col_names, # type: 
ignore[has-type] ) # Update list of original names to include all indices. self.num_original_columns = len(self.columns) From 9db189bf8950eb8f30453c6dd159670859d634a0 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 11:27:28 +0100 Subject: [PATCH 03/14] Move comment --- pandas/io/parsers/python_parser.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a6f57bad08b2d..e740bec7a9e52 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -27,11 +27,6 @@ EmptyDataError, ParserError, ) - -# BOM character (byte order mark) -# This exists at the beginning of a file to indicate endianness -# of a file (stream). Unfortunately, this marker screws up parsing, -# so we need to remove it if we see it. from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_integer @@ -42,6 +37,10 @@ parser_defaults, ) +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. _BOM = "\ufeff" From 467421b0c849c2ff5d7d4a8e0b709d9b6cbe9efd Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 13:54:54 +0100 Subject: [PATCH 04/14] Continue typing --- pandas/io/parsers/arrow_parser_wrapper.py | 7 +++- pandas/io/parsers/base_parser.py | 46 ++++++++++++++++++++--- pandas/io/parsers/python_parser.py | 30 ++++++++++----- 3 files changed, 67 insertions(+), 16 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 5b1b178c4f610..596a29bad04b3 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -107,7 +107,12 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: multi_index_named = False frame.columns = self.names # we only need the frame not the names - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + # error: Incompatible types in assignment (expression has type + # "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp, + # Timedelta, Any]]], Index]", variable has type "Index") [assignment] + frame.columns, frame = self._do_date_conversions( # type: ignore[assignment] + frame.columns, frame + ) if self.index_col is not None: for i, item in enumerate(self.index_col): if is_integer(item): diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5c1bca9cd81bb..3073d7239fbc6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -13,6 +13,7 @@ Sequence, cast, final, + overload, ) import warnings @@ -33,6 +34,7 @@ ParserError, ParserWarning, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -54,6 +56,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna +from pandas import DataFrame from pandas.core import algorithms from pandas.core.arrays import Categorical from pandas.core.indexes.api import ( @@ -381,7 +384,7 @@ def extract(r): return names, index_names, col_names, passed_names @final - def _maybe_dedup_names(self, names): + def _maybe_dedup_names(self, names: list[Scalar | tuple]) -> list[Scalar | tuple]: # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names # issue and 
appears to be satisfactory to users, @@ -389,7 +392,7 @@ def _maybe_dedup_names(self, names): # would be nice! if self.mangle_dupe_cols: names = list(names) # so we can index - counts: DefaultDict[int | str | tuple, int] = defaultdict(int) + counts: DefaultDict[Scalar | tuple, int] = defaultdict(int) is_potential_mi = _is_potential_multi_index(names, self.index_col) for i, col in enumerate(names): @@ -399,6 +402,7 @@ def _maybe_dedup_names(self, names): counts[col] = cur_count + 1 if is_potential_mi: + assert isinstance(col, tuple) col = col[:-1] + (f"{col[-1]}.{cur_count}",) else: col = f"{col}.{cur_count}" @@ -798,7 +802,35 @@ def _cast_types(self, values, cast_type, column): ) from err return values - def _do_date_conversions(self, names, data): + @overload + def _do_date_conversions( + self, + names: Index, + data: DataFrame, + ) -> tuple[list[Scalar] | Index, DataFrame]: + ... + + @overload + def _do_date_conversions( + self, + names: list[Scalar | tuple], + data: dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray], + ) -> tuple[ + list[Scalar | tuple], + dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray], + ]: + ... + + def _do_date_conversions( + self, + names: list[Scalar | tuple] | Index, + data: dict[Scalar | tuple, ArrayLike] + | dict[Scalar | tuple, np.ndarray] + | DataFrame, + ) -> tuple[ + list[Scalar | tuple] | Index, + dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray] | DataFrame, + ]: # returns data, columns if self.parse_dates is not None: @@ -814,7 +846,11 @@ def _do_date_conversions(self, names, data): return names, data - def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: + def _check_data_length( + self, + columns: list[Scalar | tuple], + data: list[ArrayLike] | list[np.ndarray], + ) -> None: """Checks if length of data is equal to length of column names. One set of trailing commas is allowed. self.index_col not False @@ -834,7 +870,7 @@ def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: "Length of header or names does not match length of data. 
This leads " "to a loss of data with index_col=False.", ParserWarning, - stacklevel=6, + stacklevel=find_stack_level(), ) def _evaluate_usecols(self, usecols, names): diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e740bec7a9e52..9d8408fa38b02 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -11,7 +11,10 @@ import sys from typing import ( DefaultDict, + Dict, Iterator, + Tuple, + Union, cast, ) import warnings @@ -20,6 +23,7 @@ import pandas._libs.lib as lib from pandas._typing import ( + ArrayLike, FilePathOrBuffer, Scalar, ) @@ -173,7 +177,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): ) self.num = re.compile(regex) - def _make_reader(self, f): + def _make_reader(self, f) -> None: sep = self.delimiter if sep is None or len(sep) == 1: @@ -239,7 +243,7 @@ def _read(): # TextIOWrapper, mmap, None]") self.data = reader # type: ignore[assignment] - def read(self, rows=None): + def read(self, rows: int | None = None): try: content = self._get_lines(rows) except StopIteration: @@ -276,14 +280,18 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) data, columns = self._exclude_implicit_index(alldata) - columns, data = self._do_date_conversions(columns, data) + columns, date_data = self._do_date_conversions(columns, data) + data = cast(Dict[Union[Scalar, Tuple], np.ndarray], date_data) - data = self._convert_data(data) - index, columns = self._make_index(data, alldata, columns, indexnamerow) + conv_data = self._convert_data(data) + index, columns = self._make_index(conv_data, alldata, columns, indexnamerow) - return index, columns, data + return index, columns, conv_data - def _exclude_implicit_index(self, alldata): + def _exclude_implicit_index( + self, + alldata: list[np.ndarray], + ) -> tuple[dict[Scalar | tuple, np.ndarray], list[Scalar | tuple]]: names = self._maybe_dedup_names(self.orig_names) offset = 0 @@ -305,7 +313,9 @@ def get_chunk(self, size=None): size = self.chunksize # type: ignore[attr-defined] return self.read(rows=size) - def _convert_data(self, data): + def _convert_data( + self, data: dict[Scalar | tuple, np.ndarray] + ) -> dict[Scalar | tuple, ArrayLike]: # apply converters def _clean_mapping(mapping): """converts col numbers to names""" @@ -931,7 +941,7 @@ def _get_index_name(self, columns: list[Scalar | tuple]): return index_name, orig_names, columns - def _rows_to_cols(self, content): + def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: col_len = self.num_original_columns if self._implicit_index: @@ -1014,7 +1024,7 @@ def _rows_to_cols(self, content): ] return zipped_content - def _get_lines(self, rows=None): + def _get_lines(self, rows: int | None = None): lines = self.buf new_rows = None From a719b926c2ccef69416d9792ead700c15989bbb0 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 20:24:10 +0100 Subject: [PATCH 05/14] Adjust type hints --- pandas/io/parsers/base_parser.py | 33 ++++++++++++++++++++---------- pandas/io/parsers/python_parser.py | 10 +++++---- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 3073d7239fbc6..5faac175cb32e 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -384,7 +384,9 @@ def extract(r): return names, index_names, col_names, passed_names @final - def _maybe_dedup_names(self, names: list[Scalar | tuple]) -> list[Scalar | tuple]: + def _maybe_dedup_names( + self, names: 
list[Scalar] | list[tuple] + ) -> list[Scalar] | list[tuple]: # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names # issue and appears to be satisfactory to users, @@ -618,7 +620,7 @@ def _convert_to_ndarrays( @final def _set_noconvert_dtype_columns( - self, col_indices: list[int], names: list[Scalar | tuple] + self, col_indices: list[int], names: list[Scalar] | list[tuple] ) -> set[int]: """ Set the columns that should not undergo dtype conversions. @@ -813,23 +815,32 @@ def _do_date_conversions( @overload def _do_date_conversions( self, - names: list[Scalar | tuple], + names: list[Scalar] | list[tuple], data: dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray], ) -> tuple[ - list[Scalar | tuple], - dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray], + list[Scalar] | list[tuple], + dict[Scalar, ArrayLike] + | dict[tuple, ArrayLike] + | dict[Scalar, np.ndarray] + | dict[tuple, np.ndarray], ]: ... def _do_date_conversions( self, - names: list[Scalar | tuple] | Index, - data: dict[Scalar | tuple, ArrayLike] - | dict[Scalar | tuple, np.ndarray] + names: list[Scalar] | list[tuple] | Index, + data: dict[Scalar, ArrayLike] + | dict[tuple, ArrayLike] + | dict[Scalar, np.ndarray] + | dict[tuple, np.ndarray] | DataFrame, ) -> tuple[ - list[Scalar | tuple] | Index, - dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray] | DataFrame, + list[Scalar] | list[tuple] | Index, + dict[Scalar, ArrayLike] + | dict[tuple, ArrayLike] + | dict[Scalar, np.ndarray] + | dict[tuple, np.ndarray] + | DataFrame, ]: # returns data, columns @@ -848,7 +859,7 @@ def _do_date_conversions( def _check_data_length( self, - columns: list[Scalar | tuple], + columns: list[Scalar] | list[tuple], data: list[ArrayLike] | list[np.ndarray], ) -> None: """Checks if length of data is equal to length of column names. diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 9d8408fa38b02..4ab47e5e38ef9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -127,7 +127,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. 
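# For reference, the mangle_dupe_cols scheme that _maybe_dedup_names (retyped
# above) implements, reduced to a standalone sketch that mirrors the method
# body minus the mangle_dupe_cols/index_col plumbing:

from collections import defaultdict

def dedup(names, is_potential_mi):
    names = list(names)  # so we can index
    counts = defaultdict(int)
    for i, col in enumerate(names):
        cur_count = counts[col]
        while cur_count > 0:
            counts[col] = cur_count + 1
            if is_potential_mi:
                # mangle only the last level of a MultiIndex tuple
                col = col[:-1] + (f"{col[-1]}.{cur_count}",)
            else:
                col = f"{col}.{cur_count}"
            cur_count = counts[col]
        names[i] = col
        counts[col] = cur_count + 1
    return names

print(dedup(["a", "a", "b"], False))      # ['a', 'a.1', 'b']
print(dedup([("x", 1), ("x", 1)], True))  # [('x', 1), ('x', '1.1')]
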
# error: Cannot determine type of 'index_names' - self.columns: list[Scalar | tuple] + self.columns: list[Scalar] | list[tuple] ( self.columns, self.index_names, @@ -139,7 +139,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): ) # get popped off for index - self.orig_names: list[Scalar | tuple] = list(self.columns) + self.orig_names: list[Scalar] | list[tuple] = list(self.columns) # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory @@ -291,7 +291,9 @@ def read(self, rows: int | None = None): def _exclude_implicit_index( self, alldata: list[np.ndarray], - ) -> tuple[dict[Scalar | tuple, np.ndarray], list[Scalar | tuple]]: + ) -> tuple[ + dict[Scalar, np.ndarray] | dict[tuple, np.ndarray], list[Scalar] | list[tuple] + ]: names = self._maybe_dedup_names(self.orig_names) offset = 0 @@ -873,7 +875,7 @@ def _clear_buffer(self) -> None: _implicit_index = False - def _get_index_name(self, columns: list[Scalar | tuple]): + def _get_index_name(self, columns: list[Scalar] | list[tuple]): """ Try several cases to get lines: From b39888c4123f9286121e522f3c019e4d6663c928 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 12 Nov 2021 20:36:54 +0100 Subject: [PATCH 06/14] Restrict types --- pandas/io/parsers/base_parser.py | 9 +++++++-- pandas/io/parsers/python_parser.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5faac175cb32e..a5e42f10269eb 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -235,7 +235,9 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None: errors=kwds.get("encoding_errors", "strict"), ) - def _validate_parse_dates_presence(self, columns: list[Scalar]) -> None: + def _validate_parse_dates_presence( + self, columns: list[Scalar] | list[tuple] + ) -> None: """ Check if parse_dates are in columns. 
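The direction of this "Restrict types" patch is worth spelling out: a parsed
header is homogeneous, either every label is a scalar or, for MultiIndex
columns, every label is a tuple, so `list[Scalar] | list[tuple]` encodes that
invariant, while the earlier `list[Scalar | tuple]` would also admit mixed
lists the parser never produces. A hypothetical illustration:

    flat_header: list[str] = ["a", "b"]            # plain columns
    mi_header: list[tuple] = [("a", 1), ("a", 2)]  # MultiIndex columns

    # Both satisfy list[Scalar] | list[tuple].  A mixed list such as
    # ["a", ("a", 2)] satisfies only the looser list[Scalar | tuple],
    # so the union of list types is the more precise annotation.
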
@@ -816,7 +818,10 @@ def _do_date_conversions( def _do_date_conversions( self, names: list[Scalar] | list[tuple], - data: dict[Scalar | tuple, ArrayLike] | dict[Scalar | tuple, np.ndarray], + data: dict[Scalar, ArrayLike] + | dict[tuple, ArrayLike] + | dict[Scalar, np.ndarray] + | dict[tuple, np.ndarray], ) -> tuple[ list[Scalar] | list[tuple], dict[Scalar, ArrayLike] diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 4ab47e5e38ef9..e364a17513ce9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -256,7 +256,7 @@ def read(self, rows: int | None = None): # done with first read, next time raise StopIteration self._first_chunk = False - columns = list(self.orig_names) + columns: list[Scalar] | list[tuple] = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) From 16c5ccce14c40314e86591634900cd18d1ecec0c Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 13 Nov 2021 20:28:17 +0100 Subject: [PATCH 07/14] Improve tuple type hint --- pandas/io/parsers/base_parser.py | 36 +++++++++++++++--------------- pandas/io/parsers/python_parser.py | 20 ++++++++++------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a5e42f10269eb..a76f49a1cdbb6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -236,7 +236,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None: ) def _validate_parse_dates_presence( - self, columns: list[Scalar] | list[tuple] + self, columns: list[Scalar] | list[tuple[Scalar, ...]] ) -> None: """ Check if parse_dates are in columns. @@ -387,8 +387,8 @@ def extract(r): @final def _maybe_dedup_names( - self, names: list[Scalar] | list[tuple] - ) -> list[Scalar] | list[tuple]: + self, names: list[Scalar] | list[tuple[Scalar, ...]] + ) -> list[Scalar] | list[tuple[Scalar, ...]]: # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names # issue and appears to be satisfactory to users, @@ -396,7 +396,7 @@ def _maybe_dedup_names( # would be nice! if self.mangle_dupe_cols: names = list(names) # so we can index - counts: DefaultDict[Scalar | tuple, int] = defaultdict(int) + counts: DefaultDict[Scalar | tuple[Scalar, ...], int] = defaultdict(int) is_potential_mi = _is_potential_multi_index(names, self.index_col) for i, col in enumerate(names): @@ -622,7 +622,7 @@ def _convert_to_ndarrays( @final def _set_noconvert_dtype_columns( - self, col_indices: list[int], names: list[Scalar] | list[tuple] + self, col_indices: list[int], names: list[Scalar] | list[tuple[Scalar, ...]] ) -> set[int]: """ Set the columns that should not undergo dtype conversions. @@ -817,34 +817,34 @@ def _do_date_conversions( @overload def _do_date_conversions( self, - names: list[Scalar] | list[tuple], + names: list[Scalar] | list[tuple[Scalar, ...]], data: dict[Scalar, ArrayLike] - | dict[tuple, ArrayLike] + | dict[tuple[Scalar, ...], ArrayLike] | dict[Scalar, np.ndarray] - | dict[tuple, np.ndarray], + | dict[tuple[Scalar, ...], np.ndarray], ) -> tuple[ - list[Scalar] | list[tuple], + list[Scalar] | list[tuple[Scalar, ...]], dict[Scalar, ArrayLike] - | dict[tuple, ArrayLike] + | dict[tuple[Scalar, ...], ArrayLike] | dict[Scalar, np.ndarray] - | dict[tuple, np.ndarray], + | dict[tuple[Scalar, ...], np.ndarray], ]: ... 
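# The @overload stubs above are erased at runtime; they exist purely to tell
# the type checker which argument/return pairings _do_date_conversions
# supports.  A minimal standalone pair showing the pattern (names here are
# hypothetical):

from __future__ import annotations

from typing import overload

@overload
def first_label(names: list[str]) -> str: ...
@overload
def first_label(names: list[int]) -> int: ...

def first_label(names: list[str] | list[int]) -> str | int:
    # The single real implementation must be compatible with every stub;
    # callers get back the narrowed type matching their argument.
    return names[0]

label = first_label(["a", "b"])  # checker infers str
num = first_label([1, 2])        # checker infers int
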
def _do_date_conversions( self, - names: list[Scalar] | list[tuple] | Index, + names: list[Scalar] | list[tuple[Scalar, ...]] | Index, data: dict[Scalar, ArrayLike] - | dict[tuple, ArrayLike] + | dict[tuple[Scalar, ...], ArrayLike] | dict[Scalar, np.ndarray] - | dict[tuple, np.ndarray] + | dict[tuple[Scalar, ...], np.ndarray] | DataFrame, ) -> tuple[ - list[Scalar] | list[tuple] | Index, + list[Scalar] | list[tuple[Scalar, ...]] | Index, dict[Scalar, ArrayLike] - | dict[tuple, ArrayLike] + | dict[tuple[Scalar, ...], ArrayLike] | dict[Scalar, np.ndarray] - | dict[tuple, np.ndarray] + | dict[tuple[Scalar, ...], np.ndarray] | DataFrame, ]: # returns data, columns @@ -864,7 +864,7 @@ def _do_date_conversions( def _check_data_length( self, - columns: list[Scalar] | list[tuple], + columns: list[Scalar] | list[tuple[Scalar, ...]], data: list[ArrayLike] | list[np.ndarray], ) -> None: """Checks if length of data is equal to length of column names. diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e364a17513ce9..703a33659b8db 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -127,7 +127,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. # error: Cannot determine type of 'index_names' - self.columns: list[Scalar] | list[tuple] + self.columns: list[Scalar] | list[tuple[Scalar, ...]] ( self.columns, self.index_names, @@ -139,7 +139,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): ) # get popped off for index - self.orig_names: list[Scalar] | list[tuple] = list(self.columns) + self.orig_names: list[Scalar] | list[tuple[Scalar, ...]] = list(self.columns) # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory @@ -256,7 +256,7 @@ def read(self, rows: int | None = None): # done with first read, next time raise StopIteration self._first_chunk = False - columns: list[Scalar] | list[tuple] = list(self.orig_names) + columns: list[Scalar] | list[tuple[Scalar, ...]] = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) @@ -281,7 +281,10 @@ def read(self, rows: int | None = None): data, columns = self._exclude_implicit_index(alldata) columns, date_data = self._do_date_conversions(columns, data) - data = cast(Dict[Union[Scalar, Tuple], np.ndarray], date_data) + data = cast( + Union[Dict[Scalar, np.ndarray], Dict[Tuple[Scalar, ...], np.ndarray]], + date_data, + ) conv_data = self._convert_data(data) index, columns = self._make_index(conv_data, alldata, columns, indexnamerow) @@ -292,7 +295,8 @@ def _exclude_implicit_index( self, alldata: list[np.ndarray], ) -> tuple[ - dict[Scalar, np.ndarray] | dict[tuple, np.ndarray], list[Scalar] | list[tuple] + dict[Scalar, np.ndarray] | dict[tuple[Scalar, ...], np.ndarray], + list[Scalar] | list[tuple[Scalar, ...]], ]: names = self._maybe_dedup_names(self.orig_names) @@ -316,8 +320,8 @@ def get_chunk(self, size=None): return self.read(rows=size) def _convert_data( - self, data: dict[Scalar | tuple, np.ndarray] - ) -> dict[Scalar | tuple, ArrayLike]: + self, data: dict[Scalar, np.ndarray] | dict[tuple[Scalar, ...], np.ndarray] + ) -> dict[Scalar, ArrayLike] | dict[tuple[Scalar, ...], ArrayLike]: # apply converters def _clean_mapping(mapping): """converts col numbers to names""" @@ -875,7 +879,7 
@@ def _clear_buffer(self) -> None: _implicit_index = False - def _get_index_name(self, columns: list[Scalar] | list[tuple]): + def _get_index_name(self, columns: list[Scalar] | list[tuple[Scalar, ...]]): """ Try several cases to get lines: From ddb1f23676e1f017370c0737217dc471eec42a1f Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 23 Nov 2021 22:13:41 +0100 Subject: [PATCH 08/14] Adjust typing --- pandas/io/parsers/base_parser.py | 51 +++++++++------------------ pandas/io/parsers/c_parser_wrapper.py | 8 ++--- pandas/io/parsers/python_parser.py | 26 ++++++-------- 3 files changed, 32 insertions(+), 53 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index eacc8d15f3e41..0d4bfb9c175d7 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1,6 +1,9 @@ from __future__ import annotations -from collections import defaultdict +from collections import ( + Hashable, + defaultdict, +) import csv import datetime from enum import Enum @@ -10,6 +13,7 @@ Callable, DefaultDict, Iterable, + Mapping, Sequence, cast, final, @@ -392,9 +396,7 @@ def extract(r): return names, index_names, col_names, passed_names @final - def _maybe_dedup_names( - self, names: list[Scalar] | list[tuple[Scalar, ...]] - ) -> list[Scalar] | list[tuple[Scalar, ...]]: + def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names # issue and appears to be satisfactory to users, @@ -412,6 +414,7 @@ def _maybe_dedup_names( counts[col] = cur_count + 1 if is_potential_mi: + # for mypy assert isinstance(col, tuple) col = col[:-1] + (f"{col[-1]}.{cur_count}",) else: @@ -544,7 +547,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: @final def _convert_to_ndarrays( self, - dct: dict, + dct: Mapping, na_values, na_fvalues, verbose: bool = False, @@ -817,42 +820,22 @@ def _do_date_conversions( self, names: Index, data: DataFrame, - ) -> tuple[list[Scalar] | Index, DataFrame]: + ) -> tuple[Sequence[Hashable] | Index, DataFrame]: ... @overload def _do_date_conversions( self, - names: list[Scalar] | list[tuple[Scalar, ...]], - data: dict[Scalar, ArrayLike] - | dict[tuple[Scalar, ...], ArrayLike] - | dict[Scalar, np.ndarray] - | dict[tuple[Scalar, ...], np.ndarray], - ) -> tuple[ - list[Scalar] | list[tuple[Scalar, ...]], - dict[Scalar, ArrayLike] - | dict[tuple[Scalar, ...], ArrayLike] - | dict[Scalar, np.ndarray] - | dict[tuple[Scalar, ...], np.ndarray], - ]: + names: Sequence[Hashable], + data: Mapping[Hashable, ArrayLike], + ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: ... 
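# Part of what the Mapping/Sequence/Hashable rewrite above buys: the abstract,
# read-only container types compose with subtyping where the concrete ones do
# not, because Sequence is covariant in its element type while list is
# invariant.  A small sketch of the distinction (function names hypothetical):

from __future__ import annotations

from typing import Hashable, Sequence

def takes_sequence(cols: Sequence[Hashable]) -> None: ...

def takes_list(cols: list[Hashable]) -> None: ...

names: list[str] = ["a", "b"]

takes_sequence(names)  # OK: list[str] is a valid Sequence[Hashable]
takes_list(names)      # mypy error: list is invariant, because a
                       # list[Hashable] would allow appending non-str items
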
def _do_date_conversions( self, - names: list[Scalar] | list[tuple[Scalar, ...]] | Index, - data: dict[Scalar, ArrayLike] - | dict[tuple[Scalar, ...], ArrayLike] - | dict[Scalar, np.ndarray] - | dict[tuple[Scalar, ...], np.ndarray] - | DataFrame, - ) -> tuple[ - list[Scalar] | list[tuple[Scalar, ...]] | Index, - dict[Scalar, ArrayLike] - | dict[tuple[Scalar, ...], ArrayLike] - | dict[Scalar, np.ndarray] - | dict[tuple[Scalar, ...], np.ndarray] - | DataFrame, - ]: + names: Sequence[Hashable] | Index, + data: Mapping[Hashable, ArrayLike] | DataFrame, + ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: # returns data, columns if self.parse_dates is not None: @@ -870,8 +853,8 @@ def _do_date_conversions( def _check_data_length( self, - columns: list[Scalar] | list[tuple[Scalar, ...]], - data: list[ArrayLike] | list[np.ndarray], + columns: Sequence[Hashable], + data: Sequence[ArrayLike], ) -> None: """Checks if length of data is equal to length of column names. diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index e96df3b3f3782..05c963f2d2552 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -279,7 +279,7 @@ def read(self, nrows=None): data_tups = sorted(data.items()) data = {k: v for k, (i, v) in zip(names, data_tups)} - names, data = self._do_date_conversions(names, data) + names, date_data = self._do_date_conversions(names, data) else: # rename dict keys @@ -302,13 +302,13 @@ def read(self, nrows=None): data = {k: v for k, (i, v) in zip(names, data_tups)} - names, data = self._do_date_conversions(names, data) - index, names = self._make_index(data, alldata, names) + names, date_data = self._do_date_conversions(names, data) + index, names = self._make_index(date_data, alldata, names) # maybe create a mi on the columns names = self._maybe_make_multi_index_columns(names, self.col_names) - return index, names, data + return index, names, date_data def _filter_usecols(self, names): # hackish diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 910b0605c486b..e0572a6f2f63a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1,6 +1,8 @@ from __future__ import annotations from collections import ( + Mapping, + Sequence, abc, defaultdict, ) @@ -11,10 +13,8 @@ import sys from typing import ( DefaultDict, - Dict, + Hashable, Iterator, - Tuple, - Union, cast, ) import warnings @@ -259,7 +259,7 @@ def read(self, rows: int | None = None): # done with first read, next time raise StopIteration self._first_chunk = False - columns: list[Scalar] | list[tuple[Scalar, ...]] = list(self.orig_names) + columns: Sequence[Hashable] = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) @@ -284,10 +284,7 @@ def read(self, rows: int | None = None): data, columns = self._exclude_implicit_index(alldata) columns, date_data = self._do_date_conversions(columns, data) - data = cast( - Union[Dict[Scalar, np.ndarray], Dict[Tuple[Scalar, ...], np.ndarray]], - date_data, - ) + data = cast(Mapping[Hashable, np.ndarray], date_data) conv_data = self._convert_data(data) index, columns = self._make_index(conv_data, alldata, columns, indexnamerow) @@ -297,10 +294,7 @@ def read(self, rows: int | None = None): def _exclude_implicit_index( self, alldata: list[np.ndarray], - ) -> tuple[ - dict[Scalar, np.ndarray] | 
dict[tuple[Scalar, ...], np.ndarray], - list[Scalar] | list[tuple[Scalar, ...]], - ]: + ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]: names = self._maybe_dedup_names(self.orig_names) offset = 0 @@ -323,8 +317,9 @@ def get_chunk(self, size=None): return self.read(rows=size) def _convert_data( - self, data: dict[Scalar, np.ndarray] | dict[tuple[Scalar, ...], np.ndarray] - ) -> dict[Scalar, ArrayLike] | dict[tuple[Scalar, ...], ArrayLike]: + self, + data: Mapping[Hashable, np.ndarray], + ) -> Mapping[Hashable, ArrayLike]: # apply converters def _clean_mapping(mapping): """converts col numbers to names""" @@ -765,6 +760,7 @@ def _next_iter_line(self, row_num: int) -> list[Scalar] | None: # assert for mypy, data is Iterator[str] or None, would error in next assert self.data is not None line = next(self.data) + # for mypy assert isinstance(line, list) return line except csv.Error as e: @@ -882,7 +878,7 @@ def _clear_buffer(self) -> None: _implicit_index = False - def _get_index_name(self, columns: list[Scalar] | list[tuple[Scalar, ...]]): + def _get_index_name(self, columns: Sequence[Hashable]): """ Try several cases to get lines: From 97425f14a2e318a97ea74e60bd1bf12bf632f1e7 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 23 Nov 2021 22:27:44 +0100 Subject: [PATCH 09/14] Adjust types --- pandas/io/parsers/base_parser.py | 9 +++------ pandas/io/parsers/python_parser.py | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 0d4bfb9c175d7..c793c3713cb60 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -33,7 +33,6 @@ DtypeArg, FilePath, ReadCsvBuffer, - Scalar, ) from pandas.errors import ( ParserError, @@ -244,9 +243,7 @@ def _open_handles( errors=kwds.get("encoding_errors", "strict"), ) - def _validate_parse_dates_presence( - self, columns: list[Scalar] | list[tuple[Scalar, ...]] - ) -> None: + def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> None: """ Check if parse_dates are in columns. @@ -404,7 +401,7 @@ def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: # would be nice! if self.mangle_dupe_cols: names = list(names) # so we can index - counts: DefaultDict[Scalar | tuple[Scalar, ...], int] = defaultdict(int) + counts: DefaultDict[Hashable, int] = defaultdict(int) is_potential_mi = _is_potential_multi_index(names, self.index_col) for i, col in enumerate(names): @@ -631,7 +628,7 @@ def _convert_to_ndarrays( @final def _set_noconvert_dtype_columns( - self, col_indices: list[int], names: list[Scalar] | list[tuple[Scalar, ...]] + self, col_indices: list[int], names: Sequence[Hashable] ) -> set[int]: """ Set the columns that should not undergo dtype conversions. diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e0572a6f2f63a..db8e210add447 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -130,7 +130,7 @@ def __init__( # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. 
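# On the cast(...) call in read() above: typing.cast performs no runtime
# conversion or validation -- it returns its argument unchanged and merely
# tells the checker to treat the value as the stated type where the
# _do_date_conversions overloads infer something wider.  A minimal sketch:

from typing import Hashable, Mapping, cast

import numpy as np

def narrow(data: object) -> Mapping[Hashable, np.ndarray]:
    # Nothing is checked here; an incorrect cast only surfaces later.
    return cast(Mapping[Hashable, np.ndarray], data)

d = narrow({"a": np.array([1, 2])})
print(d["a"].sum())  # 3
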
         # error: Cannot determine type of 'index_names'
-        self.columns: list[Scalar] | list[tuple[Scalar, ...]]
+        self.columns: list[Hashable]
         (
             self.columns,
             self.index_names,
@@ -142,7 +142,7 @@ def __init__(
         )
 
         # get popped off for index
-        self.orig_names: list[Scalar] | list[tuple[Scalar, ...]] = list(self.columns)
+        self.orig_names: list[Hashable] = list(self.columns)
@@ -878,7 +878,7 @@ def _clear_buffer(self) -> None:
 
     _implicit_index = False
 
-    def _get_index_name(self, columns: Sequence[Hashable]):
+    def _get_index_name(self, columns: list[Hashable]):
         """
         Try several cases to get lines:

From 287098713220b120c3e92a406001c57d76c56bdf Mon Sep 17 00:00:00 2001
From: phofl
Date: Tue, 23 Nov 2021 22:38:50 +0100
Subject: [PATCH 10/14] Add docstring

---
 pandas/io/parsers/base_parser.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index c793c3713cb60..0b58f917dd6f9 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -333,8 +333,18 @@ def _extract_multi_indexer_columns(
         passed_names: bool = False,
     ):
         """
-        extract and return the names, index_names, col_names
-        header is a list-of-lists returned from the parsers
+        Extract and return the names, index_names, col_names if the column
+        names are a MultiIndex.
+
+        Parameters
+        ----------
+        header: list of lists
+            The header rows
+        index_names: list, optional
+            The names of the future index
+        passed_names: bool, default False
+            A flag specifying if names were passed
+
         """
         if len(header) < 2:
             return header[0], index_names, None, passed_names

From 2e85ac54d1aa34a8771113ad030b30d68e2250e1 Mon Sep 17 00:00:00 2001
From: phofl
Date: Wed, 24 Nov 2021 00:04:42 +0100
Subject: [PATCH 11/14] Fix cast

---
 pandas/io/parsers/python_parser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index db8e210add447..c4d5e6bf91f7a 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -11,6 +11,7 @@
 from io import StringIO
 import re
 import sys
+import typing
 from typing import (
     DefaultDict,
     Hashable,
@@ -284,7 +285,7 @@ def read(self, rows: int | None = None):
         data, columns = self._exclude_implicit_index(alldata)
 
         columns, date_data = self._do_date_conversions(columns, data)
-        data = cast(Mapping[Hashable, np.ndarray], date_data)
+        data = cast(typing.Mapping[typing.Hashable, np.ndarray], date_data)
 
         conv_data = self._convert_data(data)
         index, columns = self._make_index(conv_data, alldata, columns, indexnamerow)

From 8bd226bacb2a01879ae84c1f5f4ddaea7886edf8 Mon Sep 17 00:00:00 2001
From: phofl
Date: Wed, 24 Nov 2021 19:06:44 +0100
Subject: [PATCH 12/14] Move import

---
 pandas/io/parsers/base_parser.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 0b58f917dd6f9..0cc5643a8e593 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -1,9 +1,6 @@
 from __future__ import annotations
 
-from collections import (
-    Hashable,
-    defaultdict,
-)
+from collections import defaultdict
 import csv
 import datetime
 from enum import Enum
@@ -12,6 +9,7 @@
     Any,
     Callable,
     DefaultDict,
+    Hashable,
     Iterable,
     Mapping,
     Sequence,

From 91177a116b1493a50b193f41f9ca175287fe1e32 Mon Sep 17 00:00:00 2001
From: phofl
Date: Wed, 24 Nov 2021 19:30:29 +0100 Subject: [PATCH 13/14] Move import --- pandas/io/parsers/python_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c4d5e6bf91f7a..6148f147bb5b2 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1,8 +1,6 @@ from __future__ import annotations from collections import ( - Mapping, - Sequence, abc, defaultdict, ) @@ -16,6 +14,8 @@ DefaultDict, Hashable, Iterator, + Mapping, + Sequence, cast, ) import warnings From 41aca2ff043399637c7fa8b55223792e12d57c02 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 28 Nov 2021 22:12:36 +0100 Subject: [PATCH 14/14] Fix merge conflicts in typing --- pandas/io/parsers/python_parser.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 4cc240265542a..2d1433a8f21c8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -281,12 +281,12 @@ def read(self, rows: int | None = None): alldata = self._rows_to_cols(content) data, columns = self._exclude_implicit_index(alldata) - data = self._convert_data(data) - columns, data = self._do_date_conversions(columns, data) + conv_data = self._convert_data(data) + columns, conv_data = self._do_date_conversions(columns, conv_data) - index, columns = self._make_index(data, alldata, columns, indexnamerow) + index, columns = self._make_index(conv_data, alldata, columns, indexnamerow) - return index, columns, data + return index, columns, conv_data def _exclude_implicit_index( self, @@ -461,6 +461,7 @@ def _infer_columns( if clear_buffer: self._clear_buffer() + first_line: list[Scalar] | None if names is not None: # Read first row after header to check if data are longer try:
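
A closing note on a change that recurs in patches 01 and 04: hard-coded
warning stacklevels (`stacklevel=8`, `stacklevel=6`) break silently whenever
the internal call depth changes, which is why they were replaced with
`pandas.util._exceptions.find_stack_level()`, computed at warning time so the
warning points at the first frame outside the library. A simplified sketch of
the idea (not pandas' exact implementation; `_PKG_DIR` is a stand-in for the
installed package directory):

    import inspect
    import os
    import warnings

    _PKG_DIR = os.path.dirname(os.path.abspath(__file__))  # stand-in

    def find_stack_level() -> int:
        # Count how many frames, starting from this one, still live inside
        # the package; warnings.warn() can then use that count as stacklevel
        # so the reported location is the caller's code, not the library's.
        n = 0
        for frame_info in inspect.stack():
            if frame_info.filename.startswith(_PKG_DIR):
                n += 1
            else:
                break
        return max(n, 1)

    def deprecated_api() -> None:
        warnings.warn(
            "deprecated; use the new API instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )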