pandas-dev · TomAugspurger · May 24, 2017 · May 23, 2017 · May 24, 2017 · May 24, 2017
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -53,6 +53,7 @@ Indexing
 I/O
 ^^^
 
+- Bug in pd.read_csv() when comment is passed in space-delimited text files (:issue:`16472`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
 

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
                 } else if (IS_CARRIAGE(c)) {
                     self->state = EAT_CRNL;
                     break;
+                } else if (IS_COMMENT_CHAR(c)) {
+                    self->state = EAT_COMMENT;
+                    break;
                 } else if (!IS_WHITESPACE(c)) {
                     self->state = START_FIELD;
                     // fall through to subsequent state

diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
@@ -7,6 +7,8 @@
 further arguments when parsing.
 """
 
+import sys
+
 import pytest
 import numpy as np
 
@@ -417,3 +419,30 @@ def test_data_after_quote(self):
         expected = DataFrame({'a': ['1', 'ba']})
 
         tm.assert_frame_equal(result, expected)
+
+    @tm.capture_stderr
+    def test_comment_whitespace_delimited(self):
+        test_input = """\
+1 2
+2 2 3
+3 2 3 # 3 fields
+4 2 3# 3 fields
+5 2 # 2 fields
+6 2# 2 fields
+7 # 1 field, NaN
+8# 1 field, NaN
+9 2 3 # skipped line
+# comment"""
+        df = self.read_csv(StringIO(test_input), comment='#', header=None,
+                           delimiter='\\s+', skiprows=0,
+                           error_bad_lines=False)
+        error = sys.stderr.getvalue()
+        # skipped lines 2, 3, 4, 9
+        for line_num in (2, 3, 4, 9):
+            assert 'Skipping line {}'.format(line_num) in error, error
+        expected = DataFrame([[1, 2],
+                              [5, 2],
+                              [6, 2],
+                              [7, np.nan],
+                              [8, np.nan]])
+        tm.assert_frame_equal(df, expected)
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,7 @@ Indexing @@
     I/O
     ^^^
+    - Bug in pd.read_csv() when comment is passed in space-delimited text files (:issue:`16472`)
     - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
     - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
@@ Expand Down @@