diff --git a/doc/source/io.rst b/doc/source/io.rst index a78222dd748ad..6b287a2eea532 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -120,8 +120,12 @@ index_col : int or sequence or ``False``, default ``None`` each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). usecols : array-like, default ``None`` - Return a subset of the columns. Results in much faster parsing time and lower - memory usage + Return a subset of the columns. All elements in this array must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in ``names`` or + inferred from the document header row(s). For example, a valid ``usecols`` + parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter + results in much faster parsing time and lower memory usage. squeeze : boolean, default ``False`` If the parsed data only contains one column then return a Series. 
prefix : str, default ``None`` diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index ecb3ff5139ad0..f991be3dc3e10 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -101,7 +101,7 @@ API changes - ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`) - +- ``read_csv`` no longer allows a combination of strings and integers for the ``usecols`` parameter (:issue:`12678`) - ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`) - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`) @@ -211,6 +211,7 @@ Bug Fixes - Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`) - Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`) +- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`) - Bug in ``Series.rename``, ``DataFrame.rename`` and ``DataFrame.rename_axis`` not treating ``Series`` as mappings to relabel (:issue:`12623`). - Clean in ``.rolling.min`` and ``.rolling.max`` to enhance dtype handling (:issue:`12373`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7bd8a593661c5..bd14862df4e8e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -75,8 +75,12 @@ class ParserWarning(Warning): of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names) usecols : array-like, default None - Return a subset of the columns. - Results in much faster parsing time and lower memory usage. + Return a subset of the columns. All elements in this array must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). 
For example, a valid `usecols` + parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter + results in much faster parsing time and lower memory usage. squeeze : boolean, default False If the parsed data only contains one column then return a Series prefix : str, default None @@ -801,6 +805,26 @@ def _is_index_col(col): return col is not None and col is not False +def _validate_usecols_arg(usecols): + """ + Check whether or not the 'usecols' parameter + contains all integers (column selection by index) + or strings (column by name). Raises a ValueError + if that is not the case. + """ + # gh-12678 + if usecols is not None: + usecols_dtype = lib.infer_dtype(usecols) + if usecols_dtype not in ('integer', 'string'): + raise ValueError(("The elements of 'usecols' " + "must either be all strings " + "or all integers")) + + # validation has succeeded, so + # return the argument for assignment + return usecols + + class ParserBase(object): def __init__(self, kwds): @@ -1132,7 +1156,7 @@ def __init__(self, src, **kwds): self._reader = _parser.TextReader(src, **kwds) # XXX - self.usecols = self._reader.usecols + self.usecols = _validate_usecols_arg(self._reader.usecols) passed_names = self.names is None @@ -1157,18 +1181,21 @@ def __init__(self, src, **kwds): else: self.names = lrange(self._reader.table_width) - # If the names were inferred (not passed by user) and usedcols is - # defined, then ensure names refers to the used columns, not the - # document's columns. 
- if self.usecols and passed_names: - col_indices = [] - for u in self.usecols: - if isinstance(u, string_types): - col_indices.append(self.names.index(u)) - else: - col_indices.append(u) - self.names = [n for i, n in enumerate(self.names) - if i in col_indices] + # gh-9755 + # + # need to set orig_names here first + # so that proper indexing can be done + # with _set_noconvert_columns + # + # once names has been filtered, we will + # then set orig_names again to names + self.orig_names = self.names[:] + + if self.usecols: + if len(self.names) > len(self.usecols): + self.names = [n for i, n in enumerate(self.names) + if (i in self.usecols or n in self.usecols)] + if len(self.names) < len(self.usecols): raise ValueError("Usecols do not match names.") @@ -1194,13 +1221,17 @@ def __init__(self, src, **kwds): self._implicit_index = self._reader.leading_cols > 0 def _set_noconvert_columns(self): - names = self.names + names = self.orig_names + usecols = self.usecols def _set(x): - if com.is_integer(x): - self._reader.set_noconvert(x) - else: - self._reader.set_noconvert(names.index(x)) + if usecols and com.is_integer(x): + x = list(usecols)[x] + + if not com.is_integer(x): + x = names.index(x) + + self._reader.set_noconvert(x) if isinstance(self.parse_dates, list): for val in self.parse_dates: @@ -1472,7 +1503,7 @@ def __init__(self, f, **kwds): self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) - self.usecols = kwds['usecols'] + self.usecols = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] self.names_passed = kwds['names'] or None diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7f523cf3aa54d..2d56275279453 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2682,12 +2682,118 @@ def test_uneven_lines_with_usecols(self): df = self.read_csv(StringIO(csv), usecols=usecols) 
tm.assert_frame_equal(df, expected) - usecols = ['a', 1] + usecols = ['a', 'b'] df = self.read_csv(StringIO(csv), usecols=usecols) tm.assert_frame_equal(df, expected) - usecols = ['a', 'b'] - df = self.read_csv(StringIO(csv), usecols=usecols) + def test_usecols_with_parse_dates(self): + # See gh-9755 + s = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + + cols = { + 'a' : [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = self.read_csv(StringIO(s), usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(s), usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates_and_full_names(self): + # See gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + names = list('abcde') + + cols = { + 'a' : [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = self.read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates_and_usecol_names(self): + # See gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + names = list('acd') + + cols = { + 'a' : [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = self.read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + 
parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_mixed_dtype_usecols(self): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + msg = ("The elements of \'usecols\' " + "must either be all strings " + "or all integers") + usecols = [0, 'b', 2] + + with tm.assertRaisesRegexp(ValueError, msg): + df = self.read_csv(StringIO(data), usecols=usecols) + + def test_usecols_with_integer_like_header(self): + data = """2,0,1 + 1000,2000,3000 + 4000,5000,6000 + """ + + usecols = [0, 1] # column selection by index + expected = DataFrame(data=[[1000, 2000], + [4000, 5000]], + columns=['2', '0']) + df = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = ['0', '1'] # column selection by name + expected = DataFrame(data=[[2000, 3000], + [5000, 6000]], + columns=['0', '1']) + df = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(df, expected)