From 14f5454fd235f045b6781d12adec5d586b2422e5 Mon Sep 17 00:00:00 2001 From: Joel Sonoda Date: Tue, 18 Oct 2022 20:21:59 -0600 Subject: [PATCH 1/3] BUG: CSV C engine raises an error on single line CSV with no header when passing extra names (#47566) * Expect the provided number of columns when the names property is set * Add tests to demonstrate handling of files with a single row with fewer columns. --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/parsers.pyx | 3 +++ .../io/parser/common/test_common_basic.py | 20 +++++++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5e8677e2ae7a6..bde626e7e8ac3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -252,6 +252,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`) +- Bug in :func:`read_csv` for a single-line csv with fewer columns than :parameter:`names` raised :class:`.errors.ParserError` r ``engine="c"`` (:issue:`47566`) - Period diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 51294037f4cd7..b30e0ff8b099e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -744,6 +744,8 @@ cdef class TextReader: elif self.names is not None: # Names passed if self.parser.lines < 1: + if not self.has_usecols: + self.parser.expected_fields = len(self.names) self._tokenize_rows(1) header = [self.names] @@ -756,6 +758,7 @@ cdef class TextReader: # Enforce this unless usecols if not self.has_usecols: self.parser.expected_fields = max(field_count, len(self.names)) + else: # No header passed nor to be found in the file if self.parser.lines < 1: diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 359b059252556..31ef22f5e339c 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -919,6 +919,26 @@ def test_malformed_second_line(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow +def test_short_single_line(all_parsers): + parser = all_parsers + columns = ["a", "b", "c"] + data = "1,2" + result = parser.read_csv(StringIO(data), header=None, names=columns) + expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]}) + tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_short_multi_line(all_parsers): + parser = all_parsers + columns = ["a", "b", "c"] + data = "1,2\n1,2" + result = parser.read_csv(StringIO(data), header=None, names=columns) + expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + def test_read_table_posargs_deprecation(all_parsers): # https://github.com/pandas-dev/pandas/issues/41485 data = StringIO("a\tb\n1\t2") From 5a63da00888d1e08097e9e6cbcd6b5ce5abd3d45 Mon Sep 17 00:00:00 2001 From: Joel Sonoda Date: Wed, 19 Oct 2022 07:14:30 -0600 Subject: [PATCH 2/3] bug-47566: fix formatting and typos in whatsnew --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index bde626e7e8ac3..742b759c6b14f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -252,7 +252,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`) -- Bug in :func:`read_csv` for a single-line csv with fewer columns than :parameter:`names` raised :class:`.errors.ParserError` r ``engine="c"`` (:issue:`47566`) +- Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`) - Period From 9ecc4b0655b6bbdc59b35eb8f2c9b8c8381e0248 Mon Sep 17 00:00:00 2001 From: Joel Sonoda Date: Wed, 19 Oct 2022 10:06:24 -0600 Subject: [PATCH 3/3] bug-47566: added the reference to the gh issue number on the added tests --- pandas/tests/io/parser/common/test_common_basic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 31ef22f5e339c..52d8abe76ecbc 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -921,6 +921,7 @@ def test_malformed_second_line(all_parsers): @xfail_pyarrow def test_short_single_line(all_parsers): + # GH 47566 parser = all_parsers columns = ["a", "b", "c"] data = "1,2" @@ -931,6 +932,7 @@ def test_short_single_line(all_parsers): @xfail_pyarrow def test_short_multi_line(all_parsers): + # GH 47566 parser = all_parsers columns = ["a", "b", "c"] data = "1,2\n1,2"