From 6bb558d6c118f1426211dc072e0095bcce0ed4dc Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 4 Jan 2017 19:37:27 -0800 Subject: [PATCH] BUG: Patch missing data handling with usecols Closes gh-6710. Closes gh-8985. --- doc/source/io.rst | 13 +++++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 5 +++-- pandas/io/tests/parser/usecols.py | 25 +++++++++++++++++++++++++ pandas/parser.pyx | 3 ++- 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9683fedb78303..dae97f7bc7f34 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1215,6 +1215,19 @@ You can elect to skip bad lines: 0 1 2 3 1 8 9 10 +You can also use the ``usecols`` parameter to eliminate extraneous column +data that appear in some lines but not others: + +.. code-block:: ipython + + In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) + + Out[30]: + a b c + 0 1 2 3 + 1 4 5 6 + 2 8 9 10 + .. _io.quoting: Quoting and Escape Characters diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c9ea7b427b3f2..ef731ad5e92df 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -306,6 +306,7 @@ Bug Fixes - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`) - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) +- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 41f1ab6fc16fb..f2c3113fc2cdd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2295,11 +2295,12 @@ def _infer_columns(self): columns = [lrange(ncols)] columns = self._handle_usecols(columns, columns[0]) else: - if self.usecols is None or len(names) == num_original_columns: + if self.usecols is None or len(names) >= num_original_columns: columns = self._handle_usecols([names], names) num_original_columns = len(names) else: - if self.usecols and len(names) != len(self.usecols): + if (not callable(self.usecols) and + len(names) != len(self.usecols)): raise ValueError( 'Number of passed names did not match number of ' 'header fields in the file' diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index c654859f8dc7d..96790e872abc3 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -440,3 +440,28 @@ def test_callable_usecols(self): expected = DataFrame() df = self.read_csv(StringIO(s), usecols=lambda x: False) tm.assert_frame_equal(df, expected) + + def test_incomplete_first_row(self): + # see gh-6710 + data = '1,2\n1,2,3' + names = ['a', 'b', 'c'] + expected = DataFrame({'a': [1, 1], + 'c': [np.nan, 3]}) + + usecols = ['a', 'c'] + df = self.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = lambda x: x in ['a', 'c'] + df = self.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(df, expected) + + def test_uneven_length_cols(self): + # see gh-8985 + usecols = [0, 1, 2] + data = '19,29,39\n' * 2 + '10,20,30,40' + expected = DataFrame([[19, 29, 39], + [19, 29, 39], + [10, 20, 30]]) + df = self.read_csv(StringIO(data), header=None, usecols=usecols) + tm.assert_frame_equal(df, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index c5082e999d19c..7b31f7fe27c1e 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1317,7 +1317,8 @@ cdef class TextReader: cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): if self.has_usecols and self.names is not None: - if len(self.names) == len(self.usecols): + if (not callable(self.usecols) and + len(self.names) == len(self.usecols)): return self.names[nused] else: return self.names[i - self.leading_cols]