TYP: io.parsers #41201

Merged (4 commits, May 5, 2021)
Changes from all commits

pandas/_libs/parsers.pyi (2 additions, 2 deletions)

@@ -31,8 +31,8 @@ class TextReader:
         source,
         delimiter: bytes | str = ...,  # single-character only
         header=...,
-        header_start=...,
-        header_end=...,
+        header_start: int = ...,  # int64_t
+        header_end: int = ...,  # uint64_t
         index_col=...,
         names=...,
         tokenize_chunksize: int = ...,  # int64_t
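
The C-level widths live in comments because a .pyi stub can only speak Python types; checkers just see plain int. A minimal sketch of what that buys a caller (hypothetical usage, not part of this PR):

    # Hypothetical call site, type-checked against the updated stub.
    from pandas._libs.parsers import TextReader

    with open("data.csv", "rb") as f:
        reader = TextReader(f, header_start=0, header_end=0)   # OK: ints
        # TextReader(f, header_start="0")  # now flagged by mypy, not only at runtime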

pandas/_libs/parsers.pyx (25 additions, 18 deletions)

@@ -101,13 +101,13 @@ from pandas.errors import (
 
 from pandas.core.dtypes.common import (
     is_bool_dtype,
-    is_categorical_dtype,
     is_datetime64_dtype,
     is_extension_array_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_object_dtype,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 
 cdef:
     float64_t INF = <float64_t>np.inf
@@ -305,35 +305,36 @@ cdef class TextReader:
         object na_fvalues
         object true_values, false_values
         object handle
+        object orig_header
         bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
-        uint64_t parser_start
+        bint mangle_dupe_cols, allow_leading_cols
+        uint64_t parser_start  # this is modified after __init__
         list clocks
         const char *encoding_errors
         kh_str_starts_t *false_set
         kh_str_starts_t *true_set
+        int64_t buffer_lines, skipfooter
+        list dtype_cast_order  # list[np.dtype]
+        list names  # can be None
+        set noconvert  # set[int]
 
     cdef public:
-        int64_t leading_cols, table_width, skipfooter, buffer_lines
-        bint allow_leading_cols, mangle_dupe_cols
-        bint delim_whitespace
+        int64_t leading_cols, table_width
         object delimiter  # bytes or str
         object converters
         object na_values
-        object orig_header, names, header_start, header_end
         list header  # list[list[non-negative integers]]
         object index_col
         object skiprows
         object dtype
         object usecols
-        list dtype_cast_order  # list[np.dtype]
         set unnamed_cols  # set[str]
-        set noconvert  # set[int]
 
     def __cinit__(self, source,
                   delimiter=b',',  # bytes | str
                   header=0,
-                  header_start=0,
-                  header_end=0,
+                  int64_t header_start=0,
+                  uint64_t header_end=0,
                   index_col=None,
                   names=None,
                   tokenize_chunksize=DEFAULT_CHUNKSIZE,
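
Typing the `__cinit__` parameters as `int64_t`/`uint64_t` moves validation to the call boundary: Cython converts and range-checks the Python argument before the body runs, instead of storing an untyped object. A standalone sketch of the mechanism (illustrative, not pandas code):

    # cython: language_level=3
    from libc.stdint cimport int64_t, uint64_t

    def set_bounds(int64_t start=0, uint64_t end=0):
        # start and end are C integers here; Python callers pass ordinary ints.
        return start, end

    # set_bounds(1, 2)    -> (1, 2)
    # set_bounds(end=-1)  -> OverflowError (negative value for uint64_t)
    # set_bounds("0")     -> TypeError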
@@ -457,7 +458,6 @@ cdef class TextReader:
         self.parser.warn_bad_lines = 0
 
         self.delimiter = delimiter
-        self.delim_whitespace = delim_whitespace
 
         self.na_values = na_values
         if na_fvalues is None:
@@ -502,7 +502,7 @@ cdef class TextReader:
         # header stuff
 
         self.allow_leading_cols = allow_leading_cols
-        self.leading_cols = 0
+        self.leading_cols = 0  # updated in _get_header
 
         # TODO: no header vs. header is not the first row
         self.has_mi_columns = 0
@@ -535,10 +535,11 @@ cdef class TextReader:
             self.parser.header_end = header
             self.parser_start = header + 1
             self.parser.header = header
-            prelim_header = [ header ]
+            prelim_header = [header]
 
         self.names = names
         header, table_width, unnamed_cols = self._get_header(prelim_header)
+        # header, table_width, and unnamed_cols are set here, never changed
         self.header = header
         self.table_width = table_width
         self.unnamed_cols = unnamed_cols
@@ -618,6 +619,11 @@ cdef class TextReader:
 
     cdef _get_header(self, list prelim_header):
         # header is now a list of lists, so field_count should use header[0]
+        #
+        # modifies:
+        #   self.parser attributes
+        #   self.parser_start
+        #   self.leading_cols
 
         cdef:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
@@ -710,7 +716,7 @@ cdef class TextReader:
                     header.append(this_header)
 
             if self.names is not None:
-                header = [ self.names ]
+                header = [self.names]
 
         elif self.names is not None:
             # Enforce this unless usecols
@@ -721,7 +727,7 @@ cdef class TextReader:
             if self.parser.lines < 1:
                 self._tokenize_rows(1)
 
-            header = [ self.names ]
+            header = [self.names]
 
         if self.parser.lines < 1:
             field_count = len(header[0])
@@ -778,7 +784,7 @@ cdef class TextReader:
         """
         # Conserve intermediate space
         # Caller is responsible for concatenating chunks,
-        #  see c_parser_wrapper._concatenatve_chunks
+        #  see c_parser_wrapper._concatenate_chunks
         cdef:
             size_t rows_read = 0
             list chunks = []
@@ -885,7 +891,7 @@ cdef class TextReader:
     cdef _start_clock(self):
         self.clocks.append(time.time())
 
-    cdef _end_clock(self, what):
+    cdef _end_clock(self, str what):
         if self.verbose:
             elapsed = time.time() - self.clocks.pop(-1)
             print(f'{what} took: {elapsed * 1000:.2f} ms')
@@ -1090,7 +1096,7 @@ cdef class TextReader:
                              bint user_dtype,
                              kh_str_starts_t *na_hashset,
                              object na_flist):
-        if is_categorical_dtype(dtype):
+        if isinstance(dtype, CategoricalDtype):
            # TODO: I suspect that _categorical_convert could be
            # optimized when dtype is an instance of CategoricalDtype
            codes, cats, na_count = _categorical_convert(
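
By this point `dtype` has already been resolved to a dtype object, so the `isinstance` check is equivalent here while being stricter and cheaper than `is_categorical_dtype`, which also accepts strings and array-likes. Roughly:

    from pandas import CategoricalDtype
    from pandas.api.types import is_categorical_dtype

    is_categorical_dtype("category")                  # True: strings accepted
    is_categorical_dtype(CategoricalDtype())          # True
    isinstance("category", CategoricalDtype)          # False: instances only
    isinstance(CategoricalDtype(), CategoricalDtype)  # True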
@@ -1205,6 +1211,7 @@ cdef class TextReader:
         return self.converters.get(i)
 
     cdef _get_na_list(self, Py_ssize_t i, name):
+        # Note: updates self.na_values, self.na_fvalues
         if self.na_values is None:
             return None, set()
 
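
The new note documents the side effect: per-column NA lists are resolved against, and written back to, `self.na_values`/`self.na_fvalues`. The user-visible behavior this backs is the dict form of `na_values` in `read_csv`, for example:

    import io
    import pandas as pd

    csv = io.StringIO("a,b\n-1,-1\nNA,NA\n")
    df = pd.read_csv(csv, na_values={"a": ["-1"]})
    # column "a": both "-1" and the default "NA" become NaN
    # column "b": "-1" stays -1; only default markers become NaN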

pandas/io/parsers/base_parser.py (34 additions, 26 deletions)

@@ -4,6 +4,7 @@
 import itertools
 from typing import (
     Any,
+    Callable,
     DefaultDict,
     Dict,
     Iterable,
@@ -27,6 +28,7 @@
 from pandas._typing import (
     DtypeArg,
     FilePathOrBuffer,
+    final,
 )
 from pandas.errors import (
     ParserError,
@@ -114,6 +116,9 @@
 
 
 class ParserBase:
+    _implicit_index: bool = False
+    _first_chunk: bool
+
     def __init__(self, kwds):
 
         self.names = kwds.get("names")
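
Note the two forms: `_implicit_index` is a real class attribute with a default, while the bare annotation `_first_chunk: bool` only declares the attribute for type checkers; subclasses must assign it before use. For example:

    class ParserBase:
        _implicit_index: bool = False  # class attribute, inherited default
        _first_chunk: bool             # annotation only: no value exists yet

    ParserBase._implicit_index  # False
    ParserBase._first_chunk     # AttributeError until something assigns it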
@@ -268,15 +273,17 @@ def close(self):
         if self.handles is not None:
             self.handles.close()
 
+    @final
     @property
-    def _has_complex_date_col(self):
+    def _has_complex_date_col(self) -> bool:
         return isinstance(self.parse_dates, dict) or (
             isinstance(self.parse_dates, list)
             and len(self.parse_dates) > 0
             and isinstance(self.parse_dates[0], list)
         )
 
-    def _should_parse_dates(self, i):
+    @final
+    def _should_parse_dates(self, i: int) -> bool:
         if isinstance(self.parse_dates, bool):
             return self.parse_dates
         else:
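
`final` from `pandas._typing` behaves like `typing.final`: a no-op at runtime that tells the type checker these helpers must not be overridden by the engine subclasses. A minimal illustration:

    from typing import final

    class Base:
        @final
        def _should_parse_dates(self, i: int) -> bool:
            return False

    class Sub(Base):
        # mypy: error: Cannot override final attribute "_should_parse_dates"
        def _should_parse_dates(self, i: int) -> bool:
            return True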
@@ -295,8 +302,9 @@ def _should_parse_dates(self, i):
                     name is not None and name in self.parse_dates
                 )
 
+    @final
     def _extract_multi_indexer_columns(
-        self, header, index_names, col_names, passed_names=False
+        self, header, index_names, col_names, passed_names: bool = False
     ):
         """
         extract and return the names, index_names, col_names
@@ -354,6 +362,7 @@ def extract(r):
 
         return names, index_names, col_names, passed_names
 
+    @final
     def _maybe_dedup_names(self, names):
         # see gh-7160 and gh-9424: this helps to provide
         # immediate alleviation of the duplicate names
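
For context, this dedup pass is what turns repeated headers into `a`, `a.1`, ... instead of letting later columns clobber earlier ones:

    import io
    import pandas as pd

    csv = io.StringIO("a,a,b\n1,2,3\n")
    list(pd.read_csv(csv).columns)  # ['a', 'a.1', 'b']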
@@ -382,12 +391,14 @@ def _maybe_dedup_names(self, names):
 
         return names
 
+    @final
     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
         if _is_potential_multi_index(columns):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns
 
+    @final
     def _make_index(self, data, alldata, columns, indexnamerow=False):
         if not is_index_col(self.index_col) or not self.index_col:
             index = None
@@ -415,8 +426,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
 
         return index, columns
 
-    _implicit_index = False
-
+    @final
     def _get_simple_index(self, data, columns):
         def ix(col):
             if not isinstance(col, str):
@@ -439,6 +449,7 @@ def ix(col):
 
         return index
 
+    @final
     def _get_complex_date_index(self, data, col_names):
         def _get_name(icol):
             if isinstance(icol, str):
@@ -466,7 +477,8 @@ def _get_name(icol):
 
         return index
 
-    def _agg_index(self, index, try_parse_dates=True) -> Index:
+    @final
+    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
         arrays = []
 
         for i, arr in enumerate(index):
@@ -497,8 +509,15 @@ def _agg_index(self, index, try_parse_dates=True) -> Index:
 
         return index
 
+    @final
     def _convert_to_ndarrays(
-        self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
+        self,
+        dct: dict,
+        na_values,
+        na_fvalues,
+        verbose: bool = False,
+        converters=None,
+        dtypes=None,
     ):
         result = {}
         for c, values in dct.items():
@@ -575,6 +594,7 @@ def _convert_to_ndarrays(
             print(f"Filled {na_count} NA values in column {c!s}")
         return result
 
+    @final
     def _set_noconvert_dtype_columns(
         self, col_indices: List[int], names: List[Union[int, str, Tuple]]
     ) -> Set[int]:
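
The capitalized `List`/`Set`/`Tuple`/`Union` forms come from `typing` rather than the builtins because pandas still supported Python 3.7 at the time; built-in generics such as `list[int]` only work at runtime on Python 3.9+:

    from typing import List, Set  # runtime-safe on 3.7
    # vs. `def f(xs: list[int]) -> set[int]`, which needs Python 3.9+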
@@ -1010,12 +1030,12 @@ def converter(*date_cols):
 
 def _process_date_conversion(
     data_dict,
-    converter,
+    converter: Callable,
     parse_spec,
     index_col,
     index_names,
     columns,
-    keep_date_col=False,
+    keep_date_col: bool = False,
 ):
     def _isindex(colspec):
         return (isinstance(index_col, list) and colspec in index_col) or (
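
The bare `Callable` is deliberately loose: `converter` is only ever called, and its arity varies with the date spec. When a fixed signature is known, the parameterized form pins it down; a hypothetical tightening (not what the PR does):

    from typing import Any, Callable

    ConverterT = Callable[..., Any]  # accepts any call signature explicitly

    def apply_converter(converter: ConverterT, *date_cols):
        return converter(*date_cols)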
@@ -1077,7 +1097,7 @@ def _isindex(colspec):
     return data_dict, new_cols
 
 
-def _try_convert_dates(parser, colspec, data_dict, columns):
+def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
     colset = set(columns)
     colnames = []
 
@@ -1131,21 +1151,9 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
     return na_values, na_fvalues
 
 
-# Seems to be unused
-def _get_col_names(colspec, columns):
-    colset = set(columns)
-    colnames = []
-    for c in colspec:
-        if c in colset:
-            colnames.append(c)
-        elif isinstance(c, int):
-            colnames.append(columns[c])
-    return colnames
-
-
 def _is_potential_multi_index(
     columns, index_col: Optional[Union[bool, Sequence[int]]] = None
-):
+) -> bool:
     """
     Check whether or not the `columns` parameter
     could be converted into a MultiIndex.
@@ -1159,12 +1167,12 @@ def _is_potential_multi_index(
 
     Returns
     -------
-    boolean : Whether or not columns could become a MultiIndex
+    bool : Whether or not columns could become a MultiIndex
     """
     if index_col is None or isinstance(index_col, bool):
         index_col = []
 
-    return (
+    return bool(
         len(columns)
         and not isinstance(columns, MultiIndex)
         and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
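
The `bool(...)` wrapper is what makes the new `-> bool` annotation honest: with empty `columns`, the chained `and` short-circuits and returns its falsy operand, the int `0`, rather than `False`:

    columns = []
    result = len(columns) and all(isinstance(c, tuple) for c in columns)
    result        # 0  (an int: falsy, but not a bool)
    bool(result)  # False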
@@ -1193,5 +1201,5 @@ def _validate_parse_dates_arg(parse_dates):
     return parse_dates
 
 
-def is_index_col(col):
+def is_index_col(col) -> bool:
     return col is not None and col is not False