From e67893f5ab2b03040889bc44fa85d77c5c82d662 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Wed, 8 Apr 2015 08:15:25 -0400 Subject: [PATCH] BUG: skiprows doesn't handle blank lines properly when engine='c' (GH #9832) --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/io/tests/test_parsers.py | 22 ++++++++++++++++++++++ pandas/src/parser/tokenizer.c | 18 ++++++------------ 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index bd79d9d93fd04..54892a35462d5 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -88,3 +88,4 @@ Bug Fixes - Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`) - Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`) +- Bug in ``read_csv`` and ``read_table`` when using ``skiprows`` parameter if blank lines are present. (:issue:`9832`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 35530a7f5e07f..e549ec674b18d 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -839,6 +839,28 @@ def test_deep_skiprows(self): condensed_data = self.read_csv(StringIO(condensed_text)) tm.assert_frame_equal(data, condensed_data) + def test_skiprows_blank(self): + # GH 9832 + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = self.read_csv(StringIO(text), skiprows=6, header=None, + index_col=0, parse_dates=True) + + expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), + columns=[1, 2, 3], + index=[datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)]) + expected.index.name = 0 + tm.assert_frame_equal(data, expected) + def test_detect_string_na(self): data = """A,B foo,bar diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 975142ebacc2a..1bc4096658b29 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -757,11 +757,9 @@ int tokenize_delimited(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == '\n') { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } @@ -1093,11 +1091,9 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == self->lineterminator) { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } @@ -1391,11 +1387,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == '\n') { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } else if (c == '\n') {