From 40da541e0b806a58aaea48a62b5a60840615e98d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 6 Jul 2024 10:16:01 -0700 Subject: [PATCH 1/6] Clean up index methods --- pandas/io/parsers/base_parser.py | 54 +++++++++++++------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e8faea76897c6..55acdc688a2a2 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -274,46 +274,34 @@ def _make_index( self, data, alldata, columns, indexnamerow: list[Scalar] | None = None ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: index: Index | None - if not is_index_col(self.index_col) or not self.index_col: - index = None + if isinstance(self.index_col, list) and len(self.index_col): + to_remove = [] + index = [] + for idx in self.index_col: + if isinstance(idx, str): + raise ValueError(f"Index {idx} invalid") + to_remove.append(idx) + index.append(alldata[idx]) + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + alldata.pop(i) + if not self._implicit_index: + columns.pop(i) + index = self._agg_index(index) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) else: - simple_index = self._get_simple_index(alldata, columns) - index = self._agg_index(simple_index) - - # add names for the index - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - assert index is not None - index = index.set_names(indexnamerow[:coffset]) + index = None # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns - @final - def _get_simple_index(self, data, columns): - def ix(col): - if not isinstance(col, str): - return col - raise ValueError(f"Index {col} invalid") - 
to_remove = [] - index = [] - for idx in self.index_col: - i = ix(idx) - to_remove.append(i) - index.append(data[i]) - - # remove index items from content and columns, don't pop in - # loop - for i in sorted(to_remove, reverse=True): - data.pop(i) - if not self._implicit_index: - columns.pop(i) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" From 7664117392da4d4875daf0a1ca9fd8bbf49f165e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 6 Jul 2024 10:24:18 -0700 Subject: [PATCH 2/6] Remove unused try_parse_dates --- pandas/io/parsers/base_parser.py | 4 ++-- pandas/io/parsers/c_parser_wrapper.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 55acdc688a2a2..674d61d8efb41 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -321,12 +321,12 @@ def _clean_mapping(self, mapping): return clean @final - def _agg_index(self, index, try_parse_dates: bool = True) -> Index: + def _agg_index(self, index) -> Index: arrays = [] converters = self._clean_mapping(self.converters) for i, arr in enumerate(index): - if try_parse_dates and self._should_parse_dates(i): + if self._should_parse_dates(i): arr = date_converter( arr, col=self.index_names[i] if self.index_names is not None else None, diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index b59a778624c49..d408d73729a90 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -290,7 +290,7 @@ def read( else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, try_parse_dates=True) + values = self._maybe_parse_dates(values, i) arrays.append(values) index = ensure_index_from_sequences(arrays) @@ -344,8 +344,8 @@ def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: ] 
return names - def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): - if try_parse_dates and self._should_parse_dates(index): + def _maybe_parse_dates(self, values, index: int): + if self._should_parse_dates(index): values = date_converter( values, col=self.index_names[index] if self.index_names is not None else None, From 13952f8446e6dba283737edb1d90f7c36db44365 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 6 Jul 2024 11:15:13 -0700 Subject: [PATCH 3/6] Clean usecol and date processing --- pandas/io/parsers/base_parser.py | 61 +++++++++++++-------------- pandas/io/parsers/c_parser_wrapper.py | 49 ++++++++++----------- pandas/io/parsers/python_parser.py | 3 +- 3 files changed, 54 insertions(+), 59 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 674d61d8efb41..1dafe6b7aea2c 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -324,6 +324,7 @@ def _clean_mapping(self, mapping): def _agg_index(self, index) -> Index: arrays = [] converters = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) for i, arr in enumerate(index): if self._should_parse_dates(i): @@ -352,8 +353,6 @@ def _agg_index(self, index) -> Index: else: col_na_values, col_na_fvalues = set(), set() - clean_dtypes = self._clean_mapping(self.dtype) - cast_type = None index_converter = False if self.index_names is not None: @@ -620,35 +619,6 @@ def _check_data_length( stacklevel=find_stack_level(), ) - @overload - def _evaluate_usecols( - self, - usecols: Callable[[Hashable], object], - names: Iterable[Hashable], - ) -> set[int]: ... - - @overload - def _evaluate_usecols( - self, usecols: SequenceT, names: Iterable[Hashable] - ) -> SequenceT: ... 
- - @final - def _evaluate_usecols( - self, - usecols: Callable[[Hashable], object] | SequenceT, - names: Iterable[Hashable], - ) -> SequenceT | set[int]: - """ - Check whether or not the 'usecols' parameter - is a callable. If so, enumerates the 'names' - parameter and returns a set of indices for - each entry in 'names' that evaluates to True. - If not a callable, returns 'usecols'. - """ - if callable(usecols): - return {i for i, name in enumerate(names) if usecols(name)} - return usecols - @final def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT: """ @@ -976,3 +946,32 @@ def _validate_usecols_arg(usecols): return usecols, usecols_dtype return usecols, None + + +@overload +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object], + names: Iterable[Hashable], +) -> set[int]: ... + + +@overload +def evaluate_callable_usecols( + usecols: SequenceT, names: Iterable[Hashable] +) -> SequenceT: ... + + +def evaluate_callable_usecols( + usecols: Callable[[Hashable], object] | SequenceT, + names: Iterable[Hashable], +) -> SequenceT | set[int]: + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. 
+ """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index d408d73729a90..abb6451f2594d 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -31,6 +31,7 @@ ParserBase, ParserError, date_converter, + evaluate_callable_usecols, is_index_col, validate_parse_dates_presence, ) @@ -133,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.orig_names = self.names[:] # type: ignore[has-type] if self.usecols: - usecols = self._evaluate_usecols(self.usecols, self.orig_names) + usecols = evaluate_callable_usecols(self.usecols, self.orig_names) # GH 14671 # assert for mypy, orig_names is List or None, None would error in issubset @@ -256,8 +257,7 @@ def read( columns, self.col_names ) - if self.usecols is not None: - columns = self._filter_usecols(columns) + columns = _filter_usecols(self.usecols, columns) col_dict = {k: v for k, v in col_dict.items() if k in columns} @@ -290,13 +290,21 @@ def read( else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i) + if self._should_parse_dates(i): + values = date_converter( + values, + col=self.index_names[index] + if self.index_names is not None + else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, + ) arrays.append(values) index = ensure_index_from_sequences(arrays) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) @@ -320,8 +328,7 @@ def read( names = list(self.orig_names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) # columns as list alldata = [x[1] for x 
in data_tups] @@ -335,25 +342,13 @@ def read( return index, column_names, date_data - def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: - # hackish - usecols = self._evaluate_usecols(self.usecols, names) - if usecols is not None and len(names) != len(usecols): - return [ - name for i, name in enumerate(names) if i in usecols or name in usecols - ] - return names - - def _maybe_parse_dates(self, values, index: int): - if self._should_parse_dates(index): - values = date_converter( - values, - col=self.index_names[index] if self.index_names is not None else None, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - date_format=self.date_format, - ) - return values + +def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]: + # hackish + usecols = evaluate_callable_usecols(usecols, names) + if usecols is not None and len(names) != len(usecols): + return [name for i, name in enumerate(names) if i in usecols or name in usecols] + return names def _concatenate_chunks( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 05fe963e9b2b7..172821dd076c7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -59,6 +59,7 @@ ) from pandas.io.parsers.base_parser import ( ParserBase, + evaluate_callable_usecols, get_na_values, parser_defaults, validate_parse_dates_presence, @@ -774,7 +775,7 @@ def _handle_usecols( col_indices: set[int] | list[int] if self.usecols is not None: if callable(self.usecols): - col_indices = self._evaluate_usecols(self.usecols, usecols_key) + col_indices = evaluate_callable_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: raise ValueError( From 822f3b9e934c4799ea2730e4dbe26a7e3d220d33 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 6 Jul 2024 19:05:51 -0700 Subject: [PATCH 4/6] Clean clear buffer --- 
pandas/io/parsers/python_parser.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 172821dd076c7..30ac6ad817695 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -606,7 +606,7 @@ def _infer_columns( # serve as the 'line' for parsing if have_mi_columns and hr > 0: if clear_buffer: - self._clear_buffer() + self.buf.clear() columns.append([None] * len(columns[-1])) return columns, num_original_columns, unnamed_cols @@ -688,7 +688,7 @@ def _infer_columns( num_original_columns = len(this_columns) if clear_buffer: - self._clear_buffer() + self.buf.clear() first_line: list[Scalar] | None if names is not None: @@ -1095,9 +1095,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: lines=lines, search=self.decimal, replace="." ) - def _clear_buffer(self) -> None: - self.buf = [] - def _get_index_name( self, ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: From 912967586b3730c2352db0de09cf62330c827b8d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 7 Jul 2024 12:25:49 -0700 Subject: [PATCH 5/6] remove some single use --- pandas/io/parsers/python_parser.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 30ac6ad817695..c445529a6db48 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -128,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - self.has_index_names = False - if "has_index_names" in kwds: - self.has_index_names = kwds["has_index_names"] + # Passed from read_excel + self.has_index_names = kwds.get("has_index_names", False) self.thousands = kwds["thousands"] 
self.decimal = kwds["decimal"] @@ -300,9 +299,10 @@ def read( return index, conv_columns, col_dict # handle new style for names in index - count_empty_content_vals = count_empty_vals(content[0]) indexnamerow = None - if self.has_index_names and count_empty_content_vals == len(columns): + if self.has_index_names and sum( + int(v == "" or v is None) for v in content[0] + ) == len(columns): indexnamerow = content[0] content = content[1:] @@ -1524,10 +1524,6 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: ] -def count_empty_vals(vals) -> int: - return sum(1 for v in vals if v == "" or v is None) - - def _validate_skipfooter_arg(skipfooter: int) -> int: """ Validate the 'skipfooter' parameter. From 3a0a03580b243aedcc6fe4abd1beabf88e79ee9e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 8 Jul 2024 09:18:19 -0700 Subject: [PATCH 6/6] Typing --- pandas/io/parsers/base_parser.py | 6 +++--- pandas/io/parsers/c_parser_wrapper.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 1dafe6b7aea2c..719afe160614f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -276,19 +276,19 @@ def _make_index( index: Index | None if isinstance(self.index_col, list) and len(self.index_col): to_remove = [] - index = [] + indexes = [] for idx in self.index_col: if isinstance(idx, str): raise ValueError(f"Index {idx} invalid") to_remove.append(idx) - index.append(alldata[idx]) + indexes.append(alldata[idx]) # remove index items from content and columns, don't pop in # loop for i in sorted(to_remove, reverse=True): alldata.pop(i) if not self._implicit_index: columns.pop(i) - index = self._agg_index(index) + index = self._agg_index(indexes) # add names for the index if indexnamerow: diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 
abb6451f2594d..f4198ac2a1443 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -293,7 +293,7 @@ def read( if self._should_parse_dates(i): values = date_converter( values, - col=self.index_names[index] + col=self.index_names[i] if self.index_names is not None else None, dayfirst=self.dayfirst,