From 6744b910f0c61d50d17ab86686f9fc0a6003d12c Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 27 Nov 2021 23:55:13 +0100 Subject: [PATCH 1/2] BUG: read_csv raising ParserError when some chunks have less columns than header --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_libs/parsers.pyx | 9 ++-- .../tests/io/parser/common/test_chunksize.py | 45 +++++++++++++++++++ 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e87f5f53256cf..4ba8f756254ee 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -655,6 +655,7 @@ I/O - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`) - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) +- Bug in :func:`read_csv` raising ``ParserError`` when reading file in chunks and some chunk blocks have fewer columns than header for ``engine="c"`` (:issue:`21211`) - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`) - Bug in :func:`read_csv` and :func:`read_fwf` ignoring all ``skiprows`` except first when ``nrows`` is specified for ``engine='python'`` (:issue:`44021`, :issue:`10261`) - Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d2975f83b97d7..b7d81ed59352d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -927,9 +927,12 @@ cdef class TextReader: (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise ParserError(f"Too many columns specified: 
expected " - f"{self.table_width - self.leading_cols} " - f"and found {num_cols}") + if ((not callable(self.usecols) and self.usecols and + self.table_width - self.leading_cols < len(self.usecols)) + or (self.names and len(self.names) - self.leading_cols > num_cols)): + raise ParserError(f"Too many columns specified: expected " + f"{self.table_width - self.leading_cols} " + f"and found {num_cols}") if (self.usecols is not None and not callable(self.usecols) and all(isinstance(u, int) for u in self.usecols)): diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index e8a8769bc6291..4c26047d98acc 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -248,3 +248,48 @@ def test_read_csv_memory_growth_chunksize(all_parsers): with parser.read_csv(path, chunksize=20) as result: for _ in result: pass + + +def test_chunksize_with_usecols_second_block_shorter(all_parsers): + # GH#21211 + parser = all_parsers + data = """1,2,3,4 +5,6,7,8 +9,10,11 +""" + + result_chunks = parser.read_csv( + StringIO(data), + names=["a", "b"], + chunksize=2, + usecols=[0, 1], + header=None, + ) + + expected_frames = [ + DataFrame({"a": [1, 5], "b": [2, 6]}), + DataFrame({"a": [9], "b": [10]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i]) + + +def test_chunksize_second_block_shorter(all_parsers): + # GH#21211 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +5,6,7,8 +9,10,11 +""" + + result_chunks = parser.read_csv(StringIO(data), chunksize=2) + + expected_frames = [ + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i]) From 12cfc0c85d33aa461e5b406749eb55aef0a9f99b Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 28 Nov 
2021 03:18:03 +0100 Subject: [PATCH 2/2] Rename condition --- pandas/_libs/parsers.pyx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b7d81ed59352d..5a6ad2194f99f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -926,15 +926,19 @@ cdef class TextReader: self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols + usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols + names_larger_num_cols = (self.names and + len(self.names) - self.leading_cols > num_cols) + if self.table_width - self.leading_cols > num_cols: - if ((not callable(self.usecols) and self.usecols and - self.table_width - self.leading_cols < len(self.usecols)) - or (self.names and len(self.names) - self.leading_cols > num_cols)): + if (usecols_not_callable_and_exists + and self.table_width - self.leading_cols < len(self.usecols) + or names_larger_num_cols): raise ParserError(f"Too many columns specified: expected " f"{self.table_width - self.leading_cols} " f"and found {num_cols}") - if (self.usecols is not None and not callable(self.usecols) and + if (usecols_not_callable_and_exists and all(isinstance(u, int) for u in self.usecols)): missing_usecols = [col for col in self.usecols if col >= num_cols] if missing_usecols: