Skip to content

Commit 5d112a3

Browse files
author
Pankaj Pandey
committed
Emit correct warning with C engine when skipping lines
Fixed a bug where the C engine would not print a warning for a line it skipped when that line contained an inline comment. In addition, its count of the number of fields in such lines was off by one.
1 parent 6614e26 commit 5d112a3

File tree

2 files changed

+31
-0
lines changed

2 files changed

+31
-0
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
832832
} else if (IS_CARRIAGE(c)) {
833833
self->state = EAT_CRNL;
834834
break;
835+
} else if (IS_COMMENT_CHAR(c)) {
836+
self->state = EAT_COMMENT;
837+
break;
835838
} else if (!IS_WHITESPACE(c)) {
836839
self->state = START_FIELD;
837840
// fall through to subsequent state

pandas/tests/io/parser/comment.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
for all of the parsers defined in parsers.py
66
"""
77

8+
import sys
9+
810
import numpy as np
911
import pandas.util.testing as tm
1012

@@ -116,3 +118,29 @@ def test_commment_first_line(self):
116118
expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']})
117119
result = self.read_csv(StringIO(data), comment='#', header=None)
118120
tm.assert_frame_equal(result, expected)
121+
122+
def test_comment_whitespace_delimited(self):
    """Parse whitespace-delimited rows that carry inline ``#`` comments.

    The first row has two fields; rows with three fields are expected to
    be skipped (``error_bad_lines=False``), each producing one
    'Skipping line' message on stderr, while shorter rows are kept.
    """
    test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""

    # Temporarily redirect stderr so the parser's skip messages can be
    # inspected; the original stream is always restored afterwards.
    stderr_buffer = StringIO()
    saved_stderr = sys.stderr
    sys.stderr = stderr_buffer
    try:
        result = self.read_csv(
            StringIO(test_input),
            comment='#',
            header=None,
            delimiter='\\s+',
            skiprows=0,
            error_bad_lines=False,
        )
    finally:
        sys.stderr = saved_stderr

    # skipped lines 2, 3, 4, 9
    messages = stderr_buffer.getvalue()
    assert messages.count('Skipping line') == 4, messages

    kept_rows = [[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]
    expected = DataFrame(kept_rows)
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)