diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 21e59805fa143..36cfb4a904139 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -340,6 +340,7 @@ I/O - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) - Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) +- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) Plotting diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2188ff6b0d464..7ba1a6cd398c9 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1189,8 +1189,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); + if (word_deletions >= 1) { + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); + } else { + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ + char_count = 0; + } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 33460262a4430..0f3a5be76ae60 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2093,3 +2093,16 @@ def test(): parser.read_csv(path) td.check_file_leaks(test)() + + +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows])