Skip to content

CLN: More read_csv state #59210

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 53 additions & 66 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,46 +274,34 @@ def _make_index(
self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
index: Index | None
if not is_index_col(self.index_col) or not self.index_col:
index = None
if isinstance(self.index_col, list) and len(self.index_col):
to_remove = []
indexes = []
for idx in self.index_col:
if isinstance(idx, str):
raise ValueError(f"Index {idx} invalid")
to_remove.append(idx)
indexes.append(alldata[idx])
# remove index items from content and columns, don't pop in
# loop
for i in sorted(to_remove, reverse=True):
alldata.pop(i)
if not self._implicit_index:
columns.pop(i)
index = self._agg_index(indexes)

# add names for the index
if indexnamerow:
coffset = len(indexnamerow) - len(columns)
index = index.set_names(indexnamerow[:coffset])
else:
simple_index = self._get_simple_index(alldata, columns)
index = self._agg_index(simple_index)

# add names for the index
if indexnamerow:
coffset = len(indexnamerow) - len(columns)
assert index is not None
index = index.set_names(indexnamerow[:coffset])
index = None

# maybe create a mi on the columns
columns = self._maybe_make_multi_index_columns(columns, self.col_names)

return index, columns

@final
def _get_simple_index(self, data, columns):
def ix(col):
if not isinstance(col, str):
return col
raise ValueError(f"Index {col} invalid")

to_remove = []
index = []
for idx in self.index_col:
i = ix(idx)
to_remove.append(i)
index.append(data[i])

# remove index items from content and columns, don't pop in
# loop
for i in sorted(to_remove, reverse=True):
data.pop(i)
if not self._implicit_index:
columns.pop(i)

return index

@final
def _clean_mapping(self, mapping):
"""converts col numbers to names"""
Expand All @@ -333,12 +321,13 @@ def _clean_mapping(self, mapping):
return clean

@final
def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
def _agg_index(self, index) -> Index:
arrays = []
converters = self._clean_mapping(self.converters)
clean_dtypes = self._clean_mapping(self.dtype)

for i, arr in enumerate(index):
if try_parse_dates and self._should_parse_dates(i):
if self._should_parse_dates(i):
arr = date_converter(
arr,
col=self.index_names[i] if self.index_names is not None else None,
Expand All @@ -364,8 +353,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
else:
col_na_values, col_na_fvalues = set(), set()

clean_dtypes = self._clean_mapping(self.dtype)

cast_type = None
index_converter = False
if self.index_names is not None:
Expand Down Expand Up @@ -632,35 +619,6 @@ def _check_data_length(
stacklevel=find_stack_level(),
)

@overload
def _evaluate_usecols(
self,
usecols: Callable[[Hashable], object],
names: Iterable[Hashable],
) -> set[int]: ...

@overload
def _evaluate_usecols(
self, usecols: SequenceT, names: Iterable[Hashable]
) -> SequenceT: ...

@final
def _evaluate_usecols(
self,
usecols: Callable[[Hashable], object] | SequenceT,
names: Iterable[Hashable],
) -> SequenceT | set[int]:
"""
Check whether or not the 'usecols' parameter
is a callable. If so, enumerates the 'names'
parameter and returns a set of indices for
each entry in 'names' that evaluates to True.
If not a callable, returns 'usecols'.
"""
if callable(usecols):
return {i for i, name in enumerate(names) if usecols(name)}
return usecols

@final
def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
"""
Expand Down Expand Up @@ -988,3 +946,32 @@ def _validate_usecols_arg(usecols):

return usecols, usecols_dtype
return usecols, None


@overload
def evaluate_callable_usecols(
    usecols: Callable[[Hashable], object],
    names: Iterable[Hashable],
) -> set[int]: ...


@overload
def evaluate_callable_usecols(
    usecols: SequenceT, names: Iterable[Hashable]
) -> SequenceT: ...


def evaluate_callable_usecols(
    usecols: Callable[[Hashable], object] | SequenceT,
    names: Iterable[Hashable],
) -> SequenceT | set[int]:
    """
    Resolve a callable 'usecols' against the column 'names'.

    When 'usecols' is a callable it acts as a predicate over the
    column names: the result is the set of positions whose name the
    predicate accepts.  A non-callable 'usecols' is passed through
    untouched.
    """
    if not callable(usecols):
        return usecols
    return {position for position, name in enumerate(names) if usecols(name)}
49 changes: 22 additions & 27 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
ParserBase,
ParserError,
date_converter,
evaluate_callable_usecols,
is_index_col,
validate_parse_dates_presence,
)
Expand Down Expand Up @@ -133,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
self.orig_names = self.names[:] # type: ignore[has-type]

if self.usecols:
usecols = self._evaluate_usecols(self.usecols, self.orig_names)
usecols = evaluate_callable_usecols(self.usecols, self.orig_names)

# GH 14671
# assert for mypy, orig_names is List or None, None would error in issubset
Expand Down Expand Up @@ -256,8 +257,7 @@ def read(
columns, self.col_names
)

if self.usecols is not None:
columns = self._filter_usecols(columns)
columns = _filter_usecols(self.usecols, columns)

col_dict = {k: v for k, v in col_dict.items() if k in columns}

Expand Down Expand Up @@ -290,13 +290,21 @@ def read(
else:
values = data.pop(self.index_col[i])

values = self._maybe_parse_dates(values, i, try_parse_dates=True)
if self._should_parse_dates(i):
values = date_converter(
values,
col=self.index_names[i]
if self.index_names is not None
else None,
dayfirst=self.dayfirst,
cache_dates=self.cache_dates,
date_format=self.date_format,
)
arrays.append(values)

index = ensure_index_from_sequences(arrays)

if self.usecols is not None:
names = self._filter_usecols(names)
names = _filter_usecols(self.usecols, names)

names = dedup_names(names, is_potential_multi_index(names, self.index_col))

Expand All @@ -320,8 +328,7 @@ def read(
names = list(self.orig_names)
names = dedup_names(names, is_potential_multi_index(names, self.index_col))

if self.usecols is not None:
names = self._filter_usecols(names)
names = _filter_usecols(self.usecols, names)

# columns as list
alldata = [x[1] for x in data_tups]
Expand All @@ -335,25 +342,13 @@ def read(

return index, column_names, date_data

def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]:
# hackish
usecols = self._evaluate_usecols(self.usecols, names)
if usecols is not None and len(names) != len(usecols):
return [
name for i, name in enumerate(names) if i in usecols or name in usecols
]
return names

def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
if try_parse_dates and self._should_parse_dates(index):
values = date_converter(
values,
col=self.index_names[index] if self.index_names is not None else None,
dayfirst=self.dayfirst,
cache_dates=self.cache_dates,
date_format=self.date_format,
)
return values

def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]:
    # hackish
    # Resolve a callable usecols to positions first; then keep only the
    # names selected either by position or by label.
    resolved = evaluate_callable_usecols(usecols, names)
    if resolved is None or len(resolved) == len(names):
        # nothing to filter - hand the original sequence back
        return names
    return [name for i, name in enumerate(names) if i in resolved or name in resolved]


def _concatenate_chunks(
Expand Down
24 changes: 9 additions & 15 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
)
from pandas.io.parsers.base_parser import (
ParserBase,
evaluate_callable_usecols,
get_na_values,
parser_defaults,
validate_parse_dates_presence,
Expand Down Expand Up @@ -127,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
self.quoting = kwds["quoting"]
self.skip_blank_lines = kwds["skip_blank_lines"]

self.has_index_names = False
if "has_index_names" in kwds:
self.has_index_names = kwds["has_index_names"]
# Passed from read_excel
self.has_index_names = kwds.get("has_index_names", False)

self.thousands = kwds["thousands"]
self.decimal = kwds["decimal"]
Expand Down Expand Up @@ -299,9 +299,10 @@ def read(
return index, conv_columns, col_dict

# handle new style for names in index
count_empty_content_vals = count_empty_vals(content[0])
indexnamerow = None
if self.has_index_names and count_empty_content_vals == len(columns):
if self.has_index_names and sum(
int(v == "" or v is None) for v in content[0]
) == len(columns):
indexnamerow = content[0]
content = content[1:]

Expand Down Expand Up @@ -605,7 +606,7 @@ def _infer_columns(
# serve as the 'line' for parsing
if have_mi_columns and hr > 0:
if clear_buffer:
self._clear_buffer()
self.buf.clear()
columns.append([None] * len(columns[-1]))
return columns, num_original_columns, unnamed_cols

Expand Down Expand Up @@ -687,7 +688,7 @@ def _infer_columns(
num_original_columns = len(this_columns)

if clear_buffer:
self._clear_buffer()
self.buf.clear()

first_line: list[Scalar] | None
if names is not None:
Expand Down Expand Up @@ -774,7 +775,7 @@ def _handle_usecols(
col_indices: set[int] | list[int]
if self.usecols is not None:
if callable(self.usecols):
col_indices = self._evaluate_usecols(self.usecols, usecols_key)
col_indices = evaluate_callable_usecols(self.usecols, usecols_key)
elif any(isinstance(u, str) for u in self.usecols):
if len(columns) > 1:
raise ValueError(
Expand Down Expand Up @@ -1094,9 +1095,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
lines=lines, search=self.decimal, replace="."
)

def _clear_buffer(self) -> None:
    # Discard previously read lines once they have been consumed
    # (e.g. after header inference has used them).
    self.buf = []

def _get_index_name(
self,
) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
Expand Down Expand Up @@ -1526,10 +1524,6 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]:
]


def count_empty_vals(vals) -> int:
    """Return how many entries of ``vals`` are the empty string or None."""
    total = 0
    for value in vals:
        if value == "" or value is None:
            total += 1
    return total


def _validate_skipfooter_arg(skipfooter: int) -> int:
"""
Validate the 'skipfooter' parameter.
Expand Down