diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 6d6a148ed025f..6c9728191f5b6 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -53,6 +53,7 @@ Indexing I/O ^^^ +- Bug in ``pd.read_csv()`` when ``comment`` is passed in space-delimited text files (:issue:`16472`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 6b0775e54da0c..be23ebb023383 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_COMMENT; + break; } else if (!IS_WHITESPACE(c)) { self->state = START_FIELD; // fall through to subsequent state diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 3e7a648474bc3..56ac10404b7b2 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -7,6 +7,8 @@ further arguments when parsing. 
""" +import sys + import pytest import numpy as np @@ -417,3 +419,30 @@ def test_data_after_quote(self): expected = DataFrame({'a': ['1', 'ba']}) tm.assert_frame_equal(result, expected) + + @tm.capture_stderr + def test_comment_whitespace_delimited(self): + test_input = """\ +1 2 +2 2 3 +3 2 3 # 3 fields +4 2 3# 3 fields +5 2 # 2 fields +6 2# 2 fields +7 # 1 field, NaN +8# 1 field, NaN +9 2 3 # skipped line +# comment""" + df = self.read_csv(StringIO(test_input), comment='#', header=None, + delimiter='\\s+', skiprows=0, + error_bad_lines=False) + error = sys.stderr.getvalue() + # skipped lines 2, 3, 4, 9 + for line_num in (2, 3, 4, 9): + assert 'Skipping line {}'.format(line_num) in error, error + expected = DataFrame([[1, 2], + [5, 2], + [6, 2], + [7, np.nan], + [8, np.nan]]) + tm.assert_frame_equal(df, expected)