TYP: io.parsers #41201

Merged (4 commits, May 5, 2021)
Changes from all commits

pandas/_libs/parsers.pyi (2 additions, 2 deletions)

@@ -31,8 +31,8 @@ class TextReader:
         source,
         delimiter: bytes | str = ...,  # single-character only
         header=...,
-        header_start=...,
-        header_end=...,
+        header_start: int = ...,  # int64_t
+        header_end: int = ...,  # uint64_t
         index_col=...,
         names=...,
         tokenize_chunksize: int = ...,  # int64_t
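
The C-level widths live in comments because a .pyi stub can only speak Python types; checkers just see plain int. A minimal sketch of what that buys a caller (hypothetical usage, not part of this PR):

    # Hypothetical call site, type-checked against the updated stub.
    from pandas._libs.parsers import TextReader

    with open("data.csv", "rb") as f:
        reader = TextReader(f, header_start=0, header_end=0)   # OK: ints
        # TextReader(f, header_start="0")  # now flagged by mypy, not only at runtime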

pandas/_libs/parsers.pyx (25 additions, 18 deletions)

@@ -101,13 +101,13 @@ from pandas.errors import (
 
 from pandas.core.dtypes.common import (
     is_bool_dtype,
-    is_categorical_dtype,
     is_datetime64_dtype,
     is_extension_array_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_object_dtype,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 
 cdef:
     float64_t INF = <float64_t>np.inf
@@ -305,35 +305,36 @@ cdef class TextReader:
         object na_fvalues
         object true_values, false_values
         object handle
+        object orig_header
         bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
-        uint64_t parser_start
+        bint mangle_dupe_cols, allow_leading_cols
+        uint64_t parser_start  # this is modified after __init__
         list clocks
         const char *encoding_errors
         kh_str_starts_t *false_set
         kh_str_starts_t *true_set
+        int64_t buffer_lines, skipfooter
+        list dtype_cast_order  # list[np.dtype]
+        list names  # can be None
+        set noconvert  # set[int]
 
     cdef public:
-        int64_t leading_cols, table_width, skipfooter, buffer_lines
-        bint allow_leading_cols, mangle_dupe_cols
-        bint delim_whitespace
+        int64_t leading_cols, table_width
         object delimiter  # bytes or str
         object converters
         object na_values
-        object orig_header, names, header_start, header_end
         list header  # list[list[non-negative integers]]
         object index_col
         object skiprows
         object dtype
         object usecols
-        list dtype_cast_order  # list[np.dtype]
         set unnamed_cols  # set[str]
-        set noconvert  # set[int]
 
     def __cinit__(self, source,
                   delimiter=b',',  # bytes | str
                   header=0,
-                  header_start=0,
-                  header_end=0,
+                  int64_t header_start=0,
+                  uint64_t header_end=0,
                   index_col=None,
                   names=None,
                   tokenize_chunksize=DEFAULT_CHUNKSIZE,
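
Typing the `__cinit__` parameters as `int64_t`/`uint64_t` moves validation to the call boundary: Cython converts and range-checks the Python argument before the body runs, instead of storing an untyped object. A standalone sketch of the mechanism (illustrative, not pandas code):

    # cython: language_level=3
    from libc.stdint cimport int64_t, uint64_t

    def set_bounds(int64_t start=0, uint64_t end=0):
        # start and end are C integers here; Python callers pass ordinary ints.
        return start, end

    # set_bounds(1, 2)    -> (1, 2)
    # set_bounds(end=-1)  -> OverflowError (negative value for uint64_t)
    # set_bounds("0")     -> TypeError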
@@ -457,7 +458,6 @@ cdef class TextReader:
         self.parser.warn_bad_lines = 0
 
         self.delimiter = delimiter
-        self.delim_whitespace = delim_whitespace
 
         self.na_values = na_values
         if na_fvalues is None:
@@ -502,7 +502,7 @@ cdef class TextReader:
         # header stuff
 
         self.allow_leading_cols = allow_leading_cols
-        self.leading_cols = 0
+        self.leading_cols = 0  # updated in _get_header
 
         # TODO: no header vs. header is not the first row
         self.has_mi_columns = 0
@@ -535,10 +535,11 @@ cdef class TextReader:
             self.parser.header_end = header
             self.parser_start = header + 1
             self.parser.header = header
-            prelim_header = [ header ]
+            prelim_header = [header]
 
         self.names = names
         header, table_width, unnamed_cols = self._get_header(prelim_header)
+        # header, table_width, and unnamed_cols are set here, never changed
         self.header = header
         self.table_width = table_width
         self.unnamed_cols = unnamed_cols
@@ -618,6 +619,11 @@ cdef class TextReader:
 
     cdef _get_header(self, list prelim_header):
         # header is now a list of lists, so field_count should use header[0]
+        #
+        # modifies:
+        #   self.parser attributes
+        #   self.parser_start
+        #   self.leading_cols
 
         cdef:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
@@ -710,7 +716,7 @@ cdef class TextReader:
                     header.append(this_header)
 
             if self.names is not None:
-                header = [ self.names ]
+                header = [self.names]
 
         elif self.names is not None:
             # Enforce this unless usecols
@@ -721,7 +727,7 @@ cdef class TextReader:
             if self.parser.lines < 1:
                 self._tokenize_rows(1)
 
-            header = [ self.names ]
+            header = [self.names]
 
         if self.parser.lines < 1:
             field_count = len(header[0])
@@ -778,7 +784,7 @@ cdef class TextReader:
         """
         # Conserve intermediate space
         # Caller is responsible for concatenating chunks,
-        #  see c_parser_wrapper._concatenatve_chunks
+        #  see c_parser_wrapper._concatenate_chunks
         cdef:
             size_t rows_read = 0
             list chunks = []
@@ -885,7 +891,7 @@ cdef class TextReader:
     cdef _start_clock(self):
         self.clocks.append(time.time())
 
-    cdef _end_clock(self, what):
+    cdef _end_clock(self, str what):
         if self.verbose:
             elapsed = time.time() - self.clocks.pop(-1)
             print(f'{what} took: {elapsed * 1000:.2f} ms')
@@ -1090,7 +1096,7 @@ cdef class TextReader:
                              bint user_dtype,
                              kh_str_starts_t *na_hashset,
                              object na_flist):
-        if is_categorical_dtype(dtype):
+        if isinstance(dtype, CategoricalDtype):
            # TODO: I suspect that _categorical_convert could be
            # optimized when dtype is an instance of CategoricalDtype
            codes, cats, na_count = _categorical_convert(
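
By this point `dtype` has already been resolved to a dtype object, so the `isinstance` check is equivalent here while being stricter and cheaper than `is_categorical_dtype`, which also accepts strings and array-likes. Roughly:

    from pandas import CategoricalDtype
    from pandas.api.types import is_categorical_dtype

    is_categorical_dtype("category")                  # True: strings accepted
    is_categorical_dtype(CategoricalDtype())          # True
    isinstance("category", CategoricalDtype)          # False: instances only
    isinstance(CategoricalDtype(), CategoricalDtype)  # True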
@@ -1205,6 +1211,7 @@ cdef class TextReader:
         return self.converters.get(i)
 
     cdef _get_na_list(self, Py_ssize_t i, name):
+        # Note: updates self.na_values, self.na_fvalues
         if self.na_values is None:
             return None, set()
 
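
The new note documents the side effect: per-column NA lists are resolved against, and written back to, `self.na_values`/`self.na_fvalues`. The user-visible behavior this backs is the dict form of `na_values` in `read_csv`, for example:

    import io
    import pandas as pd

    csv = io.StringIO("a,b\n-1,-1\nNA,NA\n")
    df = pd.read_csv(csv, na_values={"a": ["-1"]})
    # column "a": both "-1" and the default "NA" become NaN
    # column "b": "-1" stays -1; only default markers become NaN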

pandas/io/parsers/base_parser.py (34 additions, 26 deletions)

@@ -4,6 +4,7 @@
 import itertools
 from typing import (
     Any,
+    Callable,
     DefaultDict,
     Dict,
     Iterable,
@@ -27,6 +28,7 @@
 from pandas._typing import (
     DtypeArg,
     FilePathOrBuffer,
+    final,
 )
 from pandas.errors import (
     ParserError,
@@ -114,6 +116,9 @@
 
 
 class ParserBase:
+    _implicit_index: bool = False
+    _first_chunk: bool
+
     def __init__(self, kwds):
 
         self.names = kwds.get("names")
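
Note the two forms: `_implicit_index` is a real class attribute with a default, while the bare annotation `_first_chunk: bool` only declares the attribute for type checkers; subclasses must assign it before use. For example:

    class ParserBase:
        _implicit_index: bool = False  # class attribute, inherited default
        _first_chunk: bool             # annotation only: no value exists yet

    ParserBase._implicit_index  # False
    ParserBase._first_chunk     # AttributeError until something assigns it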
@@ -268,15 +273,17 @@ def close(self):
         if self.handles is not None:
             self.handles.close()
 
+    @final
     @property
-    def _has_complex_date_col(self):
+    def _has_complex_date_col(self) -> bool:
         return isinstance(self.parse_dates, dict) or (
             isinstance(self.parse_dates, list)
             and len(self.parse_dates) > 0
             and isinstance(self.parse_dates[0], list)
         )
 
-    def _should_parse_dates(self, i):
+    @final
+    def _should_parse_dates(self, i: int) -> bool:
         if isinstance(self.parse_dates, bool):
             return self.parse_dates
         else:
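
`final` from `pandas._typing` behaves like `typing.final`: a no-op at runtime that tells the type checker these helpers must not be overridden by the engine subclasses. A minimal illustration:

    from typing import final

    class Base:
        @final
        def _should_parse_dates(self, i: int) -> bool:
            return False

    class Sub(Base):
        # mypy: error: Cannot override final attribute "_should_parse_dates"
        def _should_parse_dates(self, i: int) -> bool:
            return True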
@@ -295,8 +302,9 @@ def _should_parse_dates(self, i):
                     name is not None and name in self.parse_dates
                 )
 
+    @final
     def _extract_multi_indexer_columns(
-        self, header, index_names, col_names, passed_names=False
+        self, header, index_names, col_names, passed_names: bool = False
     ):
         """
         extract and return the names, index_names, col_names
@@ -354,6 +362,7 @@ def extract(r):
 
         return names, index_names, col_names, passed_names
 
+    @final
     def _maybe_dedup_names(self, names):
         # see gh-7160 and gh-9424: this helps to provide
         # immediate alleviation of the duplicate names
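
For context, this dedup pass is what turns repeated headers into `a`, `a.1`, ... instead of letting later columns clobber earlier ones:

    import io
    import pandas as pd

    csv = io.StringIO("a,a,b\n1,2,3\n")
    list(pd.read_csv(csv).columns)  # ['a', 'a.1', 'b']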
@@ -382,12 +391,14 @@ def _maybe_dedup_names(self, names):
 
         return names
 
+    @final
     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
         if _is_potential_multi_index(columns):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns
 
+    @final
     def _make_index(self, data, alldata, columns, indexnamerow=False):
         if not is_index_col(self.index_col) or not self.index_col:
             index = None
@@ -415,8 +426,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
 
         return index, columns
 
-    _implicit_index = False
-
+    @final
     def _get_simple_index(self, data, columns):
         def ix(col):
             if not isinstance(col, str):
@@ -439,6 +449,7 @@ def ix(col):
 
         return index
 
+    @final
     def _get_complex_date_index(self, data, col_names):
         def _get_name(icol):
             if isinstance(icol, str):
@@ -466,7 +477,8 @@ def _get_name(icol):
 
         return index
 
-    def _agg_index(self, index, try_parse_dates=True) -> Index:
+    @final
+    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
         arrays = []
 
         for i, arr in enumerate(index):
@@ -497,8 +509,15 @@ def _agg_index(self, index, try_parse_dates=True) -> Index:
 
         return index
 
+    @final
     def _convert_to_ndarrays(
-        self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
+        self,
+        dct: dict,
+        na_values,
+        na_fvalues,
+        verbose: bool = False,
+        converters=None,
+        dtypes=None,
     ):
         result = {}
         for c, values in dct.items():
@@ -575,6 +594,7 @@ def _convert_to_ndarrays(
             print(f"Filled {na_count} NA values in column {c!s}")
         return result
 
+    @final
     def _set_noconvert_dtype_columns(
         self, col_indices: List[int], names: List[Union[int, str, Tuple]]
     ) -> Set[int]:
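
The capitalized `List`/`Set`/`Tuple`/`Union` forms come from `typing` rather than the builtins because pandas still supported Python 3.7 at the time; built-in generics such as `list[int]` only work at runtime on Python 3.9+:

    from typing import List, Set  # runtime-safe on 3.7
    # vs. `def f(xs: list[int]) -> set[int]`, which needs Python 3.9+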
@@ -1010,12 +1030,12 @@ def converter(*date_cols):
 
 def _process_date_conversion(
     data_dict,
-    converter,
+    converter: Callable,
     parse_spec,
     index_col,
     index_names,
     columns,
-    keep_date_col=False,
+    keep_date_col: bool = False,
 ):
     def _isindex(colspec):
         return (isinstance(index_col, list) and colspec in index_col) or (
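
The bare `Callable` is deliberately loose: `converter` is only ever called, and its arity varies with the date spec. When a fixed signature is known, the parameterized form pins it down; a hypothetical tightening (not what the PR does):

    from typing import Any, Callable

    ConverterT = Callable[..., Any]  # accepts any call signature explicitly

    def apply_converter(converter: ConverterT, *date_cols):
        return converter(*date_cols)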
@@ -1077,7 +1097,7 @@ def _isindex(colspec):
     return data_dict, new_cols
 
 
-def _try_convert_dates(parser, colspec, data_dict, columns):
+def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
     colset = set(columns)
     colnames = []
 
@@ -1131,21 +1151,9 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
     return na_values, na_fvalues
 
 
-# Seems to be unused
-def _get_col_names(colspec, columns):
-    colset = set(columns)
-    colnames = []
-    for c in colspec:
-        if c in colset:
-            colnames.append(c)
-        elif isinstance(c, int):
-            colnames.append(columns[c])
-    return colnames
-
-
 def _is_potential_multi_index(
     columns, index_col: Optional[Union[bool, Sequence[int]]] = None
-):
+) -> bool:
     """
     Check whether or not the `columns` parameter
     could be converted into a MultiIndex.
@@ -1159,12 +1167,12 @@ def _is_potential_multi_index(
 
     Returns
     -------
-    boolean : Whether or not columns could become a MultiIndex
+    bool : Whether or not columns could become a MultiIndex
     """
     if index_col is None or isinstance(index_col, bool):
         index_col = []
 
-    return (
+    return bool(
         len(columns)
         and not isinstance(columns, MultiIndex)
         and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
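
The `bool(...)` wrapper is what makes the new `-> bool` annotation honest: with empty `columns`, the chained `and` short-circuits and returns its falsy operand, the int `0`, rather than `False`:

    columns = []
    result = len(columns) and all(isinstance(c, tuple) for c in columns)
    result        # 0  (an int: falsy, but not a bool)
    bool(result)  # False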
@@ -1193,5 +1201,5 @@ def _validate_parse_dates_arg(parse_dates):
     return parse_dates
 
 
-def is_index_col(col):
+def is_index_col(col) -> bool:
     return col is not None and col is not False