From e5097a422469615b89b30c83be56dae2e58b826d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 13:53:24 -0700 Subject: [PATCH 1/8] REF: avoid modifying self.index_col --- pandas/io/parsers/arrow_parser_wrapper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index b7b2ddf0293b5..a8c1d1a7eeade 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -121,13 +121,15 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # we only need the frame not the names frame.columns, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: + index_to_set = self.index_col.copy() for i, item in enumerate(self.index_col): if is_integer(item): - self.index_col[i] = frame.columns[item] + index_to_set[i] = frame.columns[item] # String case elif item not in frame.columns: raise ValueError(f"Index {item} invalid") - frame.set_index(self.index_col, drop=True, inplace=True) + + frame.set_index(index_to_set, drop=True, inplace=True) # Clear names if headerless and no name given if self.header is None and not multi_index_named: frame.index.names = [None] * len(frame.index.names) From 72a577bb2ea68101feda52ea434a3b41a6ab628f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 15:03:56 -0700 Subject: [PATCH 2/8] REF: remove unused, do less in init --- pandas/io/parsers/c_parser_wrapper.py | 11 ----------- pandas/io/parsers/python_parser.py | 5 ++++- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index a6647df947961..7ad16c9c61f01 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -344,17 +344,6 @@ def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: ] return names - def _get_index_names(self): - names = list(self._reader.header[0]) - idx_names = None - - if self._reader.leading_cols == 0 and self.index_col is not None: - (idx_names, names, self.index_col) = self._clean_index_names( - names, self.index_col - ) - - return names, idx_names - def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): values = self._date_conv( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 0a39d7299d1bf..3ab41991c8843 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -28,6 +28,7 @@ EmptyDataError, ParserError, ) +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( is_bool_dtype, @@ -164,6 +165,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: if len(self.decimal) != 1: raise ValueError("Only length-1 decimal markers supported") + @cache_readonly + def num(self) -> re.Pattern: decimal = re.escape(self.decimal) if self.thousands is None: regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" @@ -173,7 +176,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" rf"([0-9]?(E|e)\-?[0-9]+)?$" ) - self.num = re.compile(regex) + return re.compile(regex) def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: sep = self.delimiter From 8ff5dbc72a083d61f8cc6bb71738dba943675783 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 15:19:10 -0700 Subject: [PATCH 3/8] REF: simplify parsers calls --- pandas/io/parsers/base_parser.py | 8 +++++--- pandas/io/parsers/c_parser_wrapper.py | 2 -- pandas/io/parsers/python_parser.py | 10 +++------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 93f9609fa64c1..39e3cab3f212e 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1044,11 +1044,13 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis return index_names, columns, index_col - def _get_empty_meta( - self, columns, index_col, index_names, dtype: DtypeArg | None = None - ): + @final + def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): columns = list(columns) + index_col = self.index_col + index_names = self.index_names + # Convert `dtype` to a defaultdict of some kind. # This will enable us to write `dtype[col_name]` # without worrying about KeyError issues later on. diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 7ad16c9c61f01..ea0149eb05f58 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -245,8 +245,6 @@ def read( ) index, columns, col_dict = self._get_empty_meta( names, - self.index_col, - self.index_names, dtype=self.kwds.get("dtype"), ) columns = self._maybe_make_multi_index_columns(columns, self.col_names) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 3ab41991c8843..a273bb86e9477 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -149,9 +149,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = self._get_index_name( - self.columns - ) + (index_names, self.orig_names, self.columns) = self._get_index_name() self._name_processed = True if self.index_names is None: self.index_names = index_names @@ -273,11 +271,8 @@ def read( self.index_col, # type: ignore[has-type] ), ) - # error: Cannot determine type of 'index_col' index, columns, col_dict = self._get_empty_meta( names, - self.index_col, # type: ignore[has-type] - self.index_names, self.dtype, ) conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names) @@ -914,7 +909,7 @@ def _clear_buffer(self) -> None: _implicit_index = False def _get_index_name( - self, columns: Sequence[Hashable] + self, ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: """ Try several cases to get lines: @@ -927,6 +922,7 @@ def _get_index_name( 1 lists index columns and row 0 lists normal columns. 2) Get index from the columns if it was listed. """ + columns: Sequence[Hashable] = self.columns orig_names = list(columns) columns = list(columns) From 5074a9033cada21779002c1f5d657f9e789967db Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 15:24:41 -0700 Subject: [PATCH 4/8] use orig_names --- pandas/io/parsers/c_parser_wrapper.py | 1 + pandas/io/parsers/python_parser.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index ea0149eb05f58..70c1e9ef65582 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -246,6 +246,7 @@ def read( index, columns, col_dict = self._get_empty_meta( names, dtype=self.kwds.get("dtype"), + # TODO: can we pass self.dtype like in python_parser? ) columns = self._maybe_make_multi_index_columns(columns, self.col_names) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a273bb86e9477..c3ac2c9cfd640 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -922,7 +922,7 @@ def _get_index_name( 1 lists index columns and row 0 lists normal columns. 2) Get index from the columns if it was listed. """ - columns: Sequence[Hashable] = self.columns + columns: Sequence[Hashable] = self.orig_names orig_names = list(columns) columns = list(columns) From e7fa45a08701d7a1fd30e788c127b44b10ca5b60 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 15:41:49 -0700 Subject: [PATCH 5/8] simplify --- pandas/io/parsers/c_parser_wrapper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 70c1e9ef65582..cb3629ed0af4e 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -245,8 +245,7 @@ def read( ) index, columns, col_dict = self._get_empty_meta( names, - dtype=self.kwds.get("dtype"), - # TODO: can we pass self.dtype like in python_parser? + dtype=self.dtype, ) columns = self._maybe_make_multi_index_columns(columns, self.col_names) From 268031ef877b09289ba7cf1ac3e730aa544bdbe3 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 16:10:20 -0700 Subject: [PATCH 6/8] stronger typing sorta --- pandas/io/parsers/base_parser.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 39e3cab3f212e..e9f4288c38ed8 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -155,15 +155,19 @@ def __init__(self, kwds) -> None: # validate index_col that only contains integers if self.index_col is not None: - if not ( + # In this case we can pin down index_col as list[int] + if is_integer(self.index_col): + self.index_col = [self.index_col] + elif not ( is_list_like(self.index_col, allow_sets=False) and all(map(is_integer, self.index_col)) - or is_integer(self.index_col) ): raise ValueError( "index_col must only contain row numbers " "when specifying a multi-index header" ) + else: + self.index_col = list(self.index_col) self._name_processed = False From 25f88d50b337f8ed86fd8c525b38dc8c775e52d9 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 17:42:34 -0700 Subject: [PATCH 7/8] TYP/REF: make Parsers less stateful --- pandas/io/parsers/arrow_parser_wrapper.py | 3 --- pandas/io/parsers/base_parser.py | 12 ++++++++++-- pandas/io/parsers/python_parser.py | 19 +++++++------------ 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index a8c1d1a7eeade..e106db224c3dc 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -36,9 +36,6 @@ def _parse_kwds(self): encoding: str | None = self.kwds.get("encoding") self.encoding = "utf-8" if encoding is None else encoding - self.usecols, self.usecols_dtype = self._validate_usecols_arg( - self.kwds["usecols"] - ) na_values = self.kwds["na_values"] if isinstance(na_values, dict): raise ValueError( diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e9f4288c38ed8..16b3337006bc5 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -102,10 +102,17 @@ class BadLineHandleMethod(Enum): WARN = 1 SKIP = 2 - _implicit_index: bool = False + _implicit_index: bool _first_chunk: bool + keep_default_na: bool + dayfirst: bool + cache_dates: bool + keep_date_col: bool + usecols_dtype: str | None def __init__(self, kwds) -> None: + self._implicit_index = False + self.names = kwds.get("names") self.orig_names: Sequence[Hashable] | None = None @@ -962,6 +969,7 @@ def _validate_usecols_names(self, usecols, names: Sequence): return usecols + @final def _validate_usecols_arg(self, usecols): """ Validate the 'usecols' parameter. @@ -1325,7 +1333,7 @@ def _try_convert_dates( return new_name, new_col, colnames -def _get_na_values(col, na_values, na_fvalues, keep_default_na): +def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): """ Get the NaN values for a given column. diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c3ac2c9cfd640..4e1bcf54c0ae9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -66,6 +66,8 @@ class PythonParser(ParserBase): + _no_thousands_columns: set[int] + def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: """ Workhorse function for processing nested list into DataFrame @@ -98,8 +100,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - self.names_passed = kwds["names"] or None - self.has_index_names = False if "has_index_names" in kwds: self.has_index_names = kwds["has_index_names"] @@ -117,7 +117,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.data = cast(Iterator[str], f) else: assert hasattr(f, "readline") - self._make_reader(f) + self.data = self._make_reader(f) # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. @@ -176,7 +176,7 @@ def num(self) -> re.Pattern: ) return re.compile(regex) - def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]): sep = self.delimiter if sep is None or len(sep) == 1: @@ -238,10 +238,7 @@ def _read(): reader = _read() - # error: Incompatible types in assignment (expression has type "_reader", - # variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, - # TextIOWrapper, mmap, None]") - self.data = reader # type: ignore[assignment] + return reader def read( self, rows: int | None = None @@ -906,8 +903,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: def _clear_buffer(self) -> None: self.buf = [] - _implicit_index = False - def _get_index_name( self, ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: @@ -1316,8 +1311,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None: self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) - def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None: - self.data = FixedWidthReader( + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: + return FixedWidthReader( f, self.colspecs, self.delimiter, From 1c358a3406a91123f2a52ee55c7ebb725af31b8f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Apr 2023 18:02:43 -0700 Subject: [PATCH 8/8] final --- pandas/io/parsers/base_parser.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 16b3337006bc5..1ab7a1f38585b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -439,6 +439,7 @@ def _get_name(icol): return index + @final def _clean_mapping(self, mapping): """converts col numbers to names""" if not isinstance(mapping, dict): @@ -667,6 +668,7 @@ def _set(x) -> int: return noconvert_columns + @final def _infer_types( self, values, na_values, no_dtype_specified, try_num_bool: bool = True ) -> tuple[ArrayLike, int]: @@ -771,6 +773,7 @@ def _infer_types( return result, na_count + @final def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: """ Cast values to specified type @@ -858,6 +861,7 @@ def _do_date_conversions( ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: ... + @final def _do_date_conversions( self, names: Sequence[Hashable] | Index, @@ -879,6 +883,7 @@ def _do_date_conversions( return names, data + @final def _check_data_length( self, columns: Sequence[Hashable], @@ -922,6 +927,7 @@ def _evaluate_usecols( ) -> set[str]: ... + @final def _evaluate_usecols( self, usecols: Callable[[Hashable], object] | set[str] | set[int], @@ -938,6 +944,7 @@ def _evaluate_usecols( return {i for i, name in enumerate(names) if usecols(name)} return usecols + @final def _validate_usecols_names(self, usecols, names: Sequence): """ Validates that all usecols are present in a given @@ -1019,6 +1026,7 @@ def _validate_usecols_arg(self, usecols): return usecols, usecols_dtype return usecols, None + @final def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: if not is_index_col(index_col): return None, columns, index_col