Skip to content

Commit 4e7e2c8

Browse files
authored
TYP: io.parsers (#41201)
1 parent 0d9427f commit 4e7e2c8

File tree

5 files changed: 67 additions and 52 deletions.

pandas/_libs/parsers.pyi

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,8 @@ class TextReader:
3131
source,
3232
delimiter: bytes | str = ..., # single-character only
3333
header=...,
34-
header_start=...,
35-
header_end=...,
34+
header_start: int = ..., # int64_t
35+
header_end: int = ..., # uint64_t
3636
index_col=...,
3737
names=...,
3838
tokenize_chunksize: int = ..., # int64_t

pandas/_libs/parsers.pyx

Lines changed: 25 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -101,13 +101,13 @@ from pandas.errors import (
101101

102102
from pandas.core.dtypes.common import (
103103
is_bool_dtype,
104-
is_categorical_dtype,
105104
is_datetime64_dtype,
106105
is_extension_array_dtype,
107106
is_float_dtype,
108107
is_integer_dtype,
109108
is_object_dtype,
110109
)
110+
from pandas.core.dtypes.dtypes import CategoricalDtype
111111

112112
cdef:
113113
float64_t INF = <float64_t>np.inf
@@ -305,35 +305,36 @@ cdef class TextReader:
305305
object na_fvalues
306306
object true_values, false_values
307307
object handle
308+
object orig_header
308309
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
309-
uint64_t parser_start
310+
bint mangle_dupe_cols, allow_leading_cols
311+
uint64_t parser_start # this is modified after __init__
310312
list clocks
311313
const char *encoding_errors
312314
kh_str_starts_t *false_set
313315
kh_str_starts_t *true_set
316+
int64_t buffer_lines, skipfooter
317+
list dtype_cast_order # list[np.dtype]
318+
list names # can be None
319+
set noconvert # set[int]
314320

315321
cdef public:
316-
int64_t leading_cols, table_width, skipfooter, buffer_lines
317-
bint allow_leading_cols, mangle_dupe_cols
318-
bint delim_whitespace
322+
int64_t leading_cols, table_width
319323
object delimiter # bytes or str
320324
object converters
321325
object na_values
322-
object orig_header, names, header_start, header_end
323326
list header # list[list[non-negative integers]]
324327
object index_col
325328
object skiprows
326329
object dtype
327330
object usecols
328-
list dtype_cast_order # list[np.dtype]
329331
set unnamed_cols # set[str]
330-
set noconvert # set[int]
331332

332333
def __cinit__(self, source,
333334
delimiter=b',', # bytes | str
334335
header=0,
335-
header_start=0,
336-
header_end=0,
336+
int64_t header_start=0,
337+
uint64_t header_end=0,
337338
index_col=None,
338339
names=None,
339340
tokenize_chunksize=DEFAULT_CHUNKSIZE,
@@ -457,7 +458,6 @@ cdef class TextReader:
457458
self.parser.warn_bad_lines = 0
458459

459460
self.delimiter = delimiter
460-
self.delim_whitespace = delim_whitespace
461461

462462
self.na_values = na_values
463463
if na_fvalues is None:
@@ -502,7 +502,7 @@ cdef class TextReader:
502502
# header stuff
503503

504504
self.allow_leading_cols = allow_leading_cols
505-
self.leading_cols = 0
505+
self.leading_cols = 0 # updated in _get_header
506506

507507
# TODO: no header vs. header is not the first row
508508
self.has_mi_columns = 0
@@ -535,10 +535,11 @@ cdef class TextReader:
535535
self.parser.header_end = header
536536
self.parser_start = header + 1
537537
self.parser.header = header
538-
prelim_header = [ header ]
538+
prelim_header = [header]
539539

540540
self.names = names
541541
header, table_width, unnamed_cols = self._get_header(prelim_header)
542+
# header, table_width, and unnamed_cols are set here, never changed
542543
self.header = header
543544
self.table_width = table_width
544545
self.unnamed_cols = unnamed_cols
@@ -618,6 +619,11 @@ cdef class TextReader:
618619

619620
cdef _get_header(self, list prelim_header):
620621
# header is now a list of lists, so field_count should use header[0]
622+
#
623+
# modifies:
624+
# self.parser attributes
625+
# self.parser_start
626+
# self.leading_cols
621627

622628
cdef:
623629
Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
@@ -710,7 +716,7 @@ cdef class TextReader:
710716
header.append(this_header)
711717

712718
if self.names is not None:
713-
header = [ self.names ]
719+
header = [self.names]
714720

715721
elif self.names is not None:
716722
# Enforce this unless usecols
@@ -721,7 +727,7 @@ cdef class TextReader:
721727
if self.parser.lines < 1:
722728
self._tokenize_rows(1)
723729

724-
header = [ self.names ]
730+
header = [self.names]
725731

726732
if self.parser.lines < 1:
727733
field_count = len(header[0])
@@ -778,7 +784,7 @@ cdef class TextReader:
778784
"""
779785
# Conserve intermediate space
780786
# Caller is responsible for concatenating chunks,
781-
# see c_parser_wrapper._concatenatve_chunks
787+
# see c_parser_wrapper._concatenate_chunks
782788
cdef:
783789
size_t rows_read = 0
784790
list chunks = []
@@ -885,7 +891,7 @@ cdef class TextReader:
885891
cdef _start_clock(self):
886892
self.clocks.append(time.time())
887893

888-
cdef _end_clock(self, what):
894+
cdef _end_clock(self, str what):
889895
if self.verbose:
890896
elapsed = time.time() - self.clocks.pop(-1)
891897
print(f'{what} took: {elapsed * 1000:.2f} ms')
@@ -1090,7 +1096,7 @@ cdef class TextReader:
10901096
bint user_dtype,
10911097
kh_str_starts_t *na_hashset,
10921098
object na_flist):
1093-
if is_categorical_dtype(dtype):
1099+
if isinstance(dtype, CategoricalDtype):
10941100
# TODO: I suspect that _categorical_convert could be
10951101
# optimized when dtype is an instance of CategoricalDtype
10961102
codes, cats, na_count = _categorical_convert(
@@ -1205,6 +1211,7 @@ cdef class TextReader:
12051211
return self.converters.get(i)
12061212

12071213
cdef _get_na_list(self, Py_ssize_t i, name):
1214+
# Note: updates self.na_values, self.na_fvalues
12081215
if self.na_values is None:
12091216
return None, set()
12101217

pandas/io/parsers/base_parser.py

Lines changed: 34 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import itertools
55
from typing import (
66
Any,
7+
Callable,
78
DefaultDict,
89
Dict,
910
Iterable,
@@ -27,6 +28,7 @@
2728
from pandas._typing import (
2829
DtypeArg,
2930
FilePathOrBuffer,
31+
final,
3032
)
3133
from pandas.errors import (
3234
ParserError,
@@ -114,6 +116,9 @@
114116

115117

116118
class ParserBase:
119+
_implicit_index: bool = False
120+
_first_chunk: bool
121+
117122
def __init__(self, kwds):
118123

119124
self.names = kwds.get("names")
@@ -268,15 +273,17 @@ def close(self):
268273
if self.handles is not None:
269274
self.handles.close()
270275

276+
@final
271277
@property
272-
def _has_complex_date_col(self):
278+
def _has_complex_date_col(self) -> bool:
273279
return isinstance(self.parse_dates, dict) or (
274280
isinstance(self.parse_dates, list)
275281
and len(self.parse_dates) > 0
276282
and isinstance(self.parse_dates[0], list)
277283
)
278284

279-
def _should_parse_dates(self, i):
285+
@final
286+
def _should_parse_dates(self, i: int) -> bool:
280287
if isinstance(self.parse_dates, bool):
281288
return self.parse_dates
282289
else:
@@ -295,8 +302,9 @@ def _should_parse_dates(self, i):
295302
name is not None and name in self.parse_dates
296303
)
297304

305+
@final
298306
def _extract_multi_indexer_columns(
299-
self, header, index_names, col_names, passed_names=False
307+
self, header, index_names, col_names, passed_names: bool = False
300308
):
301309
"""
302310
extract and return the names, index_names, col_names
@@ -354,6 +362,7 @@ def extract(r):
354362

355363
return names, index_names, col_names, passed_names
356364

365+
@final
357366
def _maybe_dedup_names(self, names):
358367
# see gh-7160 and gh-9424: this helps to provide
359368
# immediate alleviation of the duplicate names
@@ -382,12 +391,14 @@ def _maybe_dedup_names(self, names):
382391

383392
return names
384393

394+
@final
385395
def _maybe_make_multi_index_columns(self, columns, col_names=None):
386396
# possibly create a column mi here
387397
if _is_potential_multi_index(columns):
388398
columns = MultiIndex.from_tuples(columns, names=col_names)
389399
return columns
390400

401+
@final
391402
def _make_index(self, data, alldata, columns, indexnamerow=False):
392403
if not is_index_col(self.index_col) or not self.index_col:
393404
index = None
@@ -415,8 +426,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
415426

416427
return index, columns
417428

418-
_implicit_index = False
419-
429+
@final
420430
def _get_simple_index(self, data, columns):
421431
def ix(col):
422432
if not isinstance(col, str):
@@ -439,6 +449,7 @@ def ix(col):
439449

440450
return index
441451

452+
@final
442453
def _get_complex_date_index(self, data, col_names):
443454
def _get_name(icol):
444455
if isinstance(icol, str):
@@ -466,7 +477,8 @@ def _get_name(icol):
466477

467478
return index
468479

469-
def _agg_index(self, index, try_parse_dates=True) -> Index:
480+
@final
481+
def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
470482
arrays = []
471483

472484
for i, arr in enumerate(index):
@@ -497,8 +509,15 @@ def _agg_index(self, index, try_parse_dates=True) -> Index:
497509

498510
return index
499511

512+
@final
500513
def _convert_to_ndarrays(
501-
self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
514+
self,
515+
dct: dict,
516+
na_values,
517+
na_fvalues,
518+
verbose: bool = False,
519+
converters=None,
520+
dtypes=None,
502521
):
503522
result = {}
504523
for c, values in dct.items():
@@ -575,6 +594,7 @@ def _convert_to_ndarrays(
575594
print(f"Filled {na_count} NA values in column {c!s}")
576595
return result
577596

597+
@final
578598
def _set_noconvert_dtype_columns(
579599
self, col_indices: List[int], names: List[Union[int, str, Tuple]]
580600
) -> Set[int]:
@@ -1010,12 +1030,12 @@ def converter(*date_cols):
10101030

10111031
def _process_date_conversion(
10121032
data_dict,
1013-
converter,
1033+
converter: Callable,
10141034
parse_spec,
10151035
index_col,
10161036
index_names,
10171037
columns,
1018-
keep_date_col=False,
1038+
keep_date_col: bool = False,
10191039
):
10201040
def _isindex(colspec):
10211041
return (isinstance(index_col, list) and colspec in index_col) or (
@@ -1077,7 +1097,7 @@ def _isindex(colspec):
10771097
return data_dict, new_cols
10781098

10791099

1080-
def _try_convert_dates(parser, colspec, data_dict, columns):
1100+
def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
10811101
colset = set(columns)
10821102
colnames = []
10831103

@@ -1131,21 +1151,9 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):
11311151
return na_values, na_fvalues
11321152

11331153

1134-
# Seems to be unused
1135-
def _get_col_names(colspec, columns):
1136-
colset = set(columns)
1137-
colnames = []
1138-
for c in colspec:
1139-
if c in colset:
1140-
colnames.append(c)
1141-
elif isinstance(c, int):
1142-
colnames.append(columns[c])
1143-
return colnames
1144-
1145-
11461154
def _is_potential_multi_index(
11471155
columns, index_col: Optional[Union[bool, Sequence[int]]] = None
1148-
):
1156+
) -> bool:
11491157
"""
11501158
Check whether or not the `columns` parameter
11511159
could be converted into a MultiIndex.
@@ -1159,12 +1167,12 @@ def _is_potential_multi_index(
11591167
11601168
Returns
11611169
-------
1162-
boolean : Whether or not columns could become a MultiIndex
1170+
bool : Whether or not columns could become a MultiIndex
11631171
"""
11641172
if index_col is None or isinstance(index_col, bool):
11651173
index_col = []
11661174

1167-
return (
1175+
return bool(
11681176
len(columns)
11691177
and not isinstance(columns, MultiIndex)
11701178
and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
@@ -1193,5 +1201,5 @@ def _validate_parse_dates_arg(parse_dates):
11931201
return parse_dates
11941202

11951203

1196-
def is_index_col(col):
1204+
def is_index_col(col) -> bool:
11971205
return col is not None and col is not False

0 commit comments

Comments
 (0)