diff --git a/doc/source/io.rst b/doc/source/io.rst index ff505f525fc22..fd998d32cfbfb 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -130,11 +130,11 @@ index_col : int or sequence or ``False``, default ``None`` MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). -usecols : array-like or callable, default ``None`` - Return a subset of the columns. If array-like, all elements must either +usecols : list-like or callable, default ``None`` + Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid array-like + inferred from the document header row(s). For example, a valid list-like `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e83f149db1f18..6524012d27fc9 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1077,6 +1077,7 @@ I/O - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) +- Bug in ``usecols`` parameter in :func:`read_csv` and :func:`read_table` where an error is not raised correctly when passing a string. 
(:issue:`20529`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 52ca3d1226f79..a24e2cdd99f6f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -445,10 +445,9 @@ cdef class TextReader: # suboptimal if usecols is not None: self.has_usecols = 1 - if callable(usecols): - self.usecols = usecols - else: - self.usecols = set(usecols) + # GH-20558, validate usecols at higher level and only pass clean + # usecols into TextReader. + self.usecols = usecols # XXX if skipfooter > 0: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 469cd6d82e4b4..780aa5d02f598 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -97,11 +97,11 @@ MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names) -usecols : array-like or callable, default None - Return a subset of the columns. If array-like, all elements must either +usecols : list-like or callable, default None + Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid array-like + inferred from the document header row(s). For example, a valid list-like `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To instantiate a DataFrame from ``data`` with element order preserved use @@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols): Parameters ---------- - usecols : array-like, callable, or None + usecols : list-like, callable, or None List of columns to use when parsing or a callable that can be used to filter a list of table columns. 
@@ -1192,17 +1192,19 @@ def _validate_usecols_arg(usecols): 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like is passed in or None if a callable or None is passed in. """ - msg = ("'usecols' must either be all strings, all unicode, " - "all integers or a callable") - + msg = ("'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable.") if usecols is not None: if callable(usecols): return usecols, None - usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('empty', 'integer', - 'string', 'unicode'): + # GH20529, ensure is iterable container but not string. + elif not is_list_like(usecols): raise ValueError(msg) - + else: + usecols_dtype = lib.infer_dtype(usecols) + if usecols_dtype not in ('empty', 'integer', + 'string', 'unicode'): + raise ValueError(msg) return set(usecols), usecols_dtype return usecols, None @@ -1697,11 +1699,12 @@ def __init__(self, src, **kwds): # #2442 kwds['allow_leading_cols'] = self.index_col is not False - self._reader = parsers.TextReader(src, **kwds) - - # XXX + # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg( - self._reader.usecols) + kwds['usecols']) + kwds['usecols'] = self.usecols + + self._reader = parsers.TextReader(src, **kwds) passed_names = self.names is None diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 195fb4cba2aed..584711528e9cb 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -16,6 +16,11 @@ class UsecolsTests(object): + msg_validate_usecols_arg = ("'usecols' must either be list-like of all " + "strings, all unicode, all integers or a " + "callable.") + msg_validate_usecols_names = ("Usecols do not match columns, columns " + "expected but not found: {0}") def test_raise_on_mixed_dtype_usecols(self): # See gh-12678 @@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self): 4000,5000,6000 """ - msg = 
("'usecols' must either be all strings, all unicode, " - "all integers or a callable") usecols = [0, 'b', 2] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): self.read_csv(StringIO(data), usecols=usecols) def test_usecols(self): @@ -85,6 +88,18 @@ def test_usecols(self): pytest.raises(ValueError, self.read_csv, StringIO(data), names=['a', 'b'], usecols=[1], header=None) + def test_usecols_single_string(self): + # GH 20558 + data = """foo, bar, baz + 1000, 2000, 3000 + 4000, 5000, 6000 + """ + + usecols = 'foo' + + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): + self.read_csv(StringIO(data), usecols=usecols) + def test_usecols_index_col_False(self): # see gh-9082 s = "a,b,c,d\n1,2,3,4\n5,6,7,8" @@ -348,13 +363,10 @@ def test_usecols_with_mixed_encoding_strings(self): 3.568935038,7,False,a ''' - msg = ("'usecols' must either be all strings, all unicode, " - "all integers or a callable") - - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB']) def test_usecols_with_multibyte_characters(self): @@ -480,11 +492,6 @@ def test_raise_on_usecols_names_mismatch(self): # GH 14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - msg = ( - "Usecols do not match columns, " - "columns expected but not found: {missing}" - ) - usecols = ['a', 'b', 'c', 'd'] df = self.read_csv(StringIO(data), usecols=usecols) expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7], @@ -492,18 +499,21 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with 
tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f', 'g'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\[('f', 'g'|'g', 'f')\]")): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -527,11 +537,13 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), names=names, usecols=usecols)