From 2726a1376f03123a40058e9d7467bd225785ec21 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 19 Mar 2020 22:06:57 +0100 Subject: [PATCH 1/4] Fix read_csv IndexError crash for c engine with header=None and 2 (or more) extra columns --- pandas/_libs/parsers.pyx | 4 ++-- pandas/tests/io/parser/test_common.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2085e91d69ed0..a0a1b4a206b9d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1316,8 +1316,8 @@ cdef class TextReader: else: if self.header is not None: j = i - self.leading_cols - # hack for #2442 - if j == len(self.header[0]): + # hack for #2442 and #26218 + if j >= len(self.header[0]): return j else: return self.header[0][j] diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 4a9fa61bc4233..5bf9587a6ca22 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2116,3 +2116,13 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): parser = all_parsers df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) tm.assert_frame_equal(df, ref[:nrows]) + + +def test_no_header_two_extra_columns(all_parsers): + # GH 26218 + column_names = ["one", "two", "three"] + ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) + stream = StringIO("foo,bar,baz,bam,blah") + parser = all_parsers + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) From e734600f6eb551eb43329829d1dac6420dbe0107 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Thu, 19 Mar 2020 22:11:02 +0100 Subject: [PATCH 2/4] Add whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4044fb2d3fa09..f3289904e7930 100644 --- 
a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -349,6 +349,7 @@ I/O - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) - Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) +- Bug in :meth:`read_csv` was raising an IndexError when header=None and 2 extra data columns Plotting From 77486c6e48cd4c5718d9db1825aad1387f37487c Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 22 Mar 2020 20:34:28 +0100 Subject: [PATCH 3/4] Update doc/source/whatsnew/v1.1.0.rst Co-Authored-By: gfyoung --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f3289904e7930..193dda97c31b7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -349,7 +349,7 @@ I/O - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) - Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) -- Bug in :meth:`read_csv` was raising an IndexError when header=None and 2 extra data columns +- Bug in :meth:`read_csv` was raising an ``IndexError`` when ``header=None`` and the data had two or more extra columns (:issue:`26218`) Plotting From 2ba6a4d2e676e621a0b4249405ab7b33f858d4de Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Sun, 22 Mar 2020 20:36:59 +0100 Subject: [PATCH 4/4] Replace "hack" comment with a more informative one --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a0a1b4a206b9d..c6b68d9a0ab5c 100644 --- a/pandas/_libs/parsers.pyx +++ 
b/pandas/_libs/parsers.pyx @@ -1316,7 +1316,7 @@ cdef class TextReader: else: if self.header is not None: j = i - self.leading_cols - # hack for #2442 and #26218 + # generate extra (bogus) headers if there are more columns than headers if j >= len(self.header[0]): return j else: