-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Raise ValueError if a column index in usecols is out of bounds. … #25686
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
caec3c5
10126e1
4eb55be
c86114b
52d9a2d
e04654b
2d22a93
98193d5
2d8835d
048094d
bdea635
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -123,7 +123,7 @@ Bug Fixes | |
~~~~~~~~~ | ||
- Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) | ||
- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) | ||
- | ||
- Bug in :func:`read_csv` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This needs to be moved to the I/O section of bug fixes. |
||
|
||
Categorical | ||
^^^^^^^^^^^ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1894,6 +1894,11 @@ def __init__(self, src, **kwds): | |
not set(usecols).issubset(self.orig_names)): | ||
_validate_usecols_names(usecols, self.orig_names) | ||
|
||
# GH 25623 | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
elif self.usecols_dtype == 'integer': | ||
indices = lrange(self._reader.table_width) | ||
_validate_usecols_names(usecols, indices) | ||
|
||
if len(self.names) > len(usecols): | ||
self.names = [n for i, n in enumerate(self.names) | ||
if (i in usecols or n in usecols)] | ||
|
@@ -2197,7 +2202,8 @@ def __init__(self, f, **kwds): | |
self.skipinitialspace = kwds['skipinitialspace'] | ||
self.lineterminator = kwds['lineterminator'] | ||
self.quoting = kwds['quoting'] | ||
self.usecols, _ = _validate_usecols_arg(kwds['usecols']) | ||
self.usecols, self.usecols_dtype = _validate_usecols_arg( | ||
kwds['usecols']) | ||
self.skip_blank_lines = kwds['skip_blank_lines'] | ||
|
||
self.warn_bad_lines = kwds['warn_bad_lines'] | ||
|
@@ -2588,6 +2594,12 @@ def _infer_columns(self): | |
if clear_buffer: | ||
self._clear_buffer() | ||
|
||
# GH 25623 | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self.usecols_dtype == 'integer': | ||
for col in columns: | ||
indices = lrange(len(col)) | ||
_validate_usecols_names(self.usecols, indices) | ||
|
||
if names is not None: | ||
if ((self.usecols is not None and | ||
len(names) != len(self.usecols)) or | ||
|
@@ -2623,6 +2635,10 @@ def _infer_columns(self): | |
ncols = len(line) | ||
num_original_columns = ncols | ||
|
||
# GH25623 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here as well. |
||
if self.usecols_dtype == 'integer': | ||
_validate_usecols_names(self.usecols, lrange(ncols)) | ||
|
||
if not names: | ||
if self.prefix: | ||
columns = [['%s%d' % (self.prefix, i) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,20 @@ | |
"expected but not found: {0}") | ||
|
||
|
||
@pytest.mark.parametrize("names,usecols", [ | ||
(None, [0, 3]), | ||
(["a", "b", "c"], [0, -1, 2]), | ||
(None, [3]), | ||
(["a"], [3]) | ||
]) | ||
def test_usecols_out_of_bounds(all_parsers, names, usecols): | ||
data = "a,b,c\n1,2,3\n4,5,6" | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
parser = all_parsers | ||
|
||
with pytest.raises(ValueError, match=_msg_validate_usecols_names): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. although this technically doesn't fail the regex, it appears that There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yea good catch @simonjayhawkins There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. indeed! @jreback I'll push an update this weekend |
||
parser.read_csv(StringIO(data), usecols=usecols, names=names) | ||
|
||
|
||
def test_raise_on_mixed_dtype_usecols(all_parsers): | ||
# See gh-12678 | ||
data = """a,b,c | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove these; these have already been moved