From 2963553954ba46dd8d570a5f4fbaf277637e4d93 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Apr 2021 10:06:07 -0700 Subject: [PATCH 1/3] CLN: tighten typing in libparsing --- pandas/_libs/parsers.pyi | 4 +-- pandas/_libs/parsers.pyx | 43 ++++++++++++++++----------- pandas/io/parsers/c_parser_wrapper.py | 2 ++ 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 18ae23e7fb90d..92b970d47467e 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -31,8 +31,8 @@ class TextReader: source, delimiter: bytes | str = ..., # single-character only header=..., - header_start=..., - header_end=..., + header_start: int = ..., # int64_t + header_end: int = ..., # uint64_t index_col=..., names=..., tokenize_chunksize: int = ..., # int64_t diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2abb7e0ea3ac2..8d9f1773590b0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -101,13 +101,13 @@ from pandas.errors import ( from pandas.core.dtypes.common import ( is_bool_dtype, - is_categorical_dtype, is_datetime64_dtype, is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtype cdef: float64_t INF = np.inf @@ -305,35 +305,36 @@ cdef class TextReader: object na_fvalues object true_values, false_values object handle + object orig_header bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns - uint64_t parser_start + bint mangle_dupe_cols, allow_leading_cols + uint64_t parser_start # this is modified after __init__ list clocks const char *encoding_errors kh_str_starts_t *false_set kh_str_starts_t *true_set + int64_t buffer_lines, skipfooter + list dtype_cast_order # list[np.dtype] + list names # can be None + set noconvert # set[int] cdef public: - int64_t leading_cols, table_width, skipfooter, buffer_lines - bint allow_leading_cols, mangle_dupe_cols - bint delim_whitespace + int64_t leading_cols, table_width object delimiter # bytes or str object converters object na_values - object orig_header, names, header_start, header_end list header # list[list[non-negative integers]] object index_col object skiprows object dtype object usecols - list dtype_cast_order # list[np.dtype] set unnamed_cols # set[str] - set noconvert # set[int] def __cinit__(self, source, delimiter=b',', # bytes | str header=0, - header_start=0, - header_end=0, + int64_t header_start=0, + uint64_t header_end=0, index_col=None, names=None, tokenize_chunksize=DEFAULT_CHUNKSIZE, @@ -457,7 +458,6 @@ cdef class TextReader: self.parser.warn_bad_lines = 0 self.delimiter = delimiter - self.delim_whitespace = delim_whitespace self.na_values = na_values if na_fvalues is None: @@ -502,7 +502,7 @@ cdef class TextReader: # header stuff self.allow_leading_cols = allow_leading_cols - self.leading_cols = 0 + self.leading_cols = 0 # updated in _get_header # TODO: no header vs. header is not the first row self.has_mi_columns = 0 @@ -535,10 +535,11 @@ cdef class TextReader: self.parser.header_end = header self.parser_start = header + 1 self.parser.header = header - prelim_header = [ header ] + prelim_header = [header] self.names = names header, table_width, unnamed_cols = self._get_header(prelim_header) + # header, table_width, and unnamed_cols are set here, never changed self.header = header self.table_width = table_width self.unnamed_cols = unnamed_cols @@ -618,6 +619,11 @@ cdef class TextReader: cdef _get_header(self, list prelim_header): # header is now a list of lists, so field_count should use header[0] + # + # modifies: + # self.parser attributes + # self.parser_start + # self.leading_cols cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count, level @@ -710,7 +716,7 @@ cdef class TextReader: header.append(this_header) if self.names is not None: - header = [ self.names ] + header = [self.names] elif self.names is not None: # Enforce this unless usecols @@ -721,7 +727,7 @@ cdef class TextReader: if self.parser.lines < 1: self._tokenize_rows(1) - header = [ self.names ] + header = [self.names] if self.parser.lines < 1: field_count = len(header[0]) @@ -778,7 +784,7 @@ cdef class TextReader: """ # Conserve intermediate space # Caller is responsible for concatenating chunks, - # see c_parser_wrapper._concatenatve_chunks + # see c_parser_wrapper._concatenate_chunks cdef: size_t rows_read = 0 list chunks = [] @@ -885,7 +891,7 @@ cdef class TextReader: cdef _start_clock(self): self.clocks.append(time.time()) - cdef _end_clock(self, what): + cdef _end_clock(self, str what): if self.verbose: elapsed = time.time() - self.clocks.pop(-1) print(f'{what} took: {elapsed * 1000:.2f} ms') @@ -1090,7 +1096,7 @@ cdef class TextReader: bint user_dtype, kh_str_starts_t *na_hashset, object na_flist): - if is_categorical_dtype(dtype): + if isinstance(dtype, CategoricalDtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( @@ -1205,6 +1211,7 @@ cdef class TextReader: return self.converters.get(i) cdef _get_na_list(self, Py_ssize_t i, name): + # Note: updates self.na_values, self.na_fvalues if self.na_values is None: return None, set() diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index fbf2a53207f75..e1ebc565466a1 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -28,6 +28,7 @@ class CParserWrapper(ParserBase): low_memory: bool + _reader: parsers.TextReader def __init__(self, src: FilePathOrBuffer, **kwds): self.kwds = kwds @@ -58,6 +59,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): except Exception: self.handles.close() raise + self.unnamed_cols = self._reader.unnamed_cols # error: Cannot determine type of 'names' From b4aba8d50a83191cc51256e70dc08e02bf9f193f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Apr 2021 10:13:30 -0700 Subject: [PATCH 2/3] TYP: base_parser --- pandas/io/parsers/base_parser.py | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a011a789bf17c..2f65a06caf7a3 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -4,6 +4,7 @@ import itertools from typing import ( Any, + Callable, DefaultDict, Dict, Iterable, @@ -27,6 +28,7 @@ from pandas._typing import ( DtypeArg, FilePathOrBuffer, + final, ) from pandas.errors import ( ParserError, @@ -114,6 +116,8 @@ class ParserBase: + _implicit_index: bool = False + def __init__(self, kwds): self.names = kwds.get("names") @@ -268,15 +272,17 @@ def close(self): if self.handles is not None: self.handles.close() + @final @property - def _has_complex_date_col(self): + def _has_complex_date_col(self) -> bool: return isinstance(self.parse_dates, dict) or ( isinstance(self.parse_dates, list) and len(self.parse_dates) > 0 and isinstance(self.parse_dates[0], list) ) - def _should_parse_dates(self, i): + @final + def _should_parse_dates(self, i: int) -> bool: if isinstance(self.parse_dates, bool): return self.parse_dates else: @@ -415,8 +421,6 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): return index, columns - _implicit_index = False - def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): @@ -466,7 +470,8 @@ def _get_name(icol): return index - def _agg_index(self, index, try_parse_dates=True) -> Index: + @final + def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] for i, arr in enumerate(index): @@ -497,8 +502,15 @@ def _agg_index(self, index, try_parse_dates=True) -> Index: return index + @final def _convert_to_ndarrays( - self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None + self, + dct: dict, + na_values, + na_fvalues, + verbose: bool = False, + converters=None, + dtypes=None, ): result = {} for c, values in dct.items(): @@ -1010,12 +1022,12 @@ def converter(*date_cols): def _process_date_conversion( data_dict, - converter, + converter: Callable, parse_spec, index_col, index_names, columns, - keep_date_col=False, + keep_date_col: bool = False, ): def _isindex(colspec): return (isinstance(index_col, list) and colspec in index_col) or ( @@ -1077,7 +1089,7 @@ def _isindex(colspec): return data_dict, new_cols -def _try_convert_dates(parser, colspec, data_dict, columns): +def _try_convert_dates(parser: Callable, colspec, data_dict, columns): colset = set(columns) colnames = [] @@ -1131,21 +1143,9 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na): return na_values, na_fvalues -# Seems to be unused -def _get_col_names(colspec, columns): - colset = set(columns) - colnames = [] - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int): - colnames.append(columns[c]) - return colnames - - def _is_potential_multi_index( columns, index_col: Optional[Union[bool, Sequence[int]]] = None -): +) -> bool: """ Check whether or not the `columns` parameter could be converted into a MultiIndex. @@ -1159,12 +1159,12 @@ def _is_potential_multi_index( Returns ------- - boolean : Whether or not columns could become a MultiIndex + bool : Whether or not columns could become a MultiIndex """ if index_col is None or isinstance(index_col, bool): index_col = [] - return ( + return bool( len(columns) and not isinstance(columns, MultiIndex) and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) @@ -1193,5 +1193,5 @@ def _validate_parse_dates_arg(parse_dates): return parse_dates -def is_index_col(col): +def is_index_col(col) -> bool: return col is not None and col is not False From 14dc3ff84d9127c2e9aed1102ff31328cf38f406 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Apr 2021 10:20:51 -0700 Subject: [PATCH 3/3] TYP: parsers --- pandas/io/parsers/base_parser.py | 10 +++++++++- pandas/io/parsers/c_parser_wrapper.py | 5 ++--- pandas/io/parsers/python_parser.py | 5 ++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2f65a06caf7a3..8ab845868285c 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -117,6 +117,7 @@ class ParserBase: _implicit_index: bool = False + _first_chunk: bool def __init__(self, kwds): @@ -301,8 +302,9 @@ def _should_parse_dates(self, i: int) -> bool: name is not None and name in self.parse_dates ) + @final def _extract_multi_indexer_columns( - self, header, index_names, col_names, passed_names=False + self, header, index_names, col_names, passed_names: bool = False ): """ extract and return the names, index_names, col_names @@ -360,6 +362,7 @@ def extract(r): return names, index_names, col_names, passed_names + @final def _maybe_dedup_names(self, names): # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names @@ -388,12 +391,14 @@ def _maybe_dedup_names(self, names): return names + @final def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here if _is_potential_multi_index(columns): columns = MultiIndex.from_tuples(columns, names=col_names) return columns + @final def _make_index(self, data, alldata, columns, indexnamerow=False): if not is_index_col(self.index_col) or not self.index_col: index = None @@ -421,6 +426,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): return index, columns + @final def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): @@ -443,6 +449,7 @@ def ix(col): return index + @final def _get_complex_date_index(self, data, col_names): def _get_name(icol): if isinstance(icol, str): @@ -587,6 +594,7 @@ def _convert_to_ndarrays( print(f"Filled {na_count} NA values in column {c!s}") return result + @final def _set_noconvert_dtype_columns( self, col_indices: List[int], names: List[Union[int, str, Tuple]] ) -> Set[int]: diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index e1ebc565466a1..7a0e704d2fbc4 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -219,8 +219,7 @@ def read(self, nrows=None): else: data = self._reader.read(nrows) except StopIteration: - # error: Cannot determine type of '_first_chunk' - if self._first_chunk: # type: ignore[has-type] + if self._first_chunk: self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = self._get_empty_meta( @@ -324,7 +323,7 @@ def _get_index_names(self): return names, idx_names - def _maybe_parse_dates(self, values, index: int, try_parse_dates=True): + def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): values = self._date_conv(values) return values diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a6d38eab99977..0055f3123f3c0 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -250,8 +250,7 @@ def read(self, rows=None): try: content = self._get_lines(rows) except StopIteration: - # error: Cannot determine type of '_first_chunk' - if self._first_chunk: # type: ignore[has-type] + if self._first_chunk: content = [] else: self.close() @@ -1195,7 +1194,7 @@ def count_empty_vals(vals) -> int: return sum(1 for v in vals if v == "" or v is None) -def _validate_skipfooter_arg(skipfooter): +def _validate_skipfooter_arg(skipfooter: int) -> int: """ Validate the 'skipfooter' parameter.