From 02f9daeb6fa1b2082894a24359e14d3441234e27 Mon Sep 17 00:00:00 2001 From: Pankaj Pandey Date: Tue, 23 May 2017 20:07:18 +0530 Subject: [PATCH 1/5] Fix correct warning with c engine when skipping lines Fixed bug where c engine would not print warnings for lines it skipped in case the skipped line had an inline comment. Also, its accounting of number of fields in such lines would be off by one. --- pandas/_libs/src/parser/tokenizer.c | 3 +++ pandas/tests/io/parser/comment.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 6b0775e54da0c..be23ebb023383 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) { } else if (IS_CARRIAGE(c)) { self->state = EAT_CRNL; break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_COMMENT; + break; } else if (!IS_WHITESPACE(c)) { self->state = START_FIELD; // fall through to subsequent state diff --git a/pandas/tests/io/parser/comment.py b/pandas/tests/io/parser/comment.py index 9987a017cf985..0e39d9152db58 100644 --- a/pandas/tests/io/parser/comment.py +++ b/pandas/tests/io/parser/comment.py @@ -5,6 +5,8 @@ for all of the parsers defined in parsers.py """ +import sys + import numpy as np import pandas.util.testing as tm @@ -116,3 +118,29 @@ def test_commment_first_line(self): expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']}) result = self.read_csv(StringIO(data), comment='#', header=None) tm.assert_frame_equal(result, expected) + + def test_comment_whitespace_delimited(self): + test_input = """\ +1 2 +2 2 3 +3 2 3 # 3 fields +4 2 3# 3 fields +5 2 # 2 fields +6 2# 2 fields +7 # 1 field, NaN +8# 1 field, NaN +9 2 3 # skipped line +# comment""" + captured_err = StringIO() + orig_stderr, sys.stderr = sys.stderr, captured_err + try: + df = self.read_csv(StringIO(test_input), 
comment='#', header=None, + delimiter='\\s+', skiprows=0, + error_bad_lines=False) + finally: + sys.stderr = orig_stderr + content = captured_err.getvalue() + # skipped lines 2, 3, 4, 9 + assert content.count('Skipping line') == 4, content + expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) + tm.assert_frame_equal(df, expected) From be08bda7fcdfe8886734abfec906b7895abbc637 Mon Sep 17 00:00:00 2001 From: Pankaj Pandey Date: Wed, 24 May 2017 12:21:10 +0530 Subject: [PATCH 2/5] Use `tm.capture_stderr` to capture stderr --- pandas/tests/io/parser/comment.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/comment.py b/pandas/tests/io/parser/comment.py index 0e39d9152db58..266b7cef2c9db 100644 --- a/pandas/tests/io/parser/comment.py +++ b/pandas/tests/io/parser/comment.py @@ -119,6 +119,7 @@ def test_commment_first_line(self): result = self.read_csv(StringIO(data), comment='#', header=None) tm.assert_frame_equal(result, expected) + @tm.capture_stderr def test_comment_whitespace_delimited(self): test_input = """\ 1 2 @@ -131,16 +132,12 @@ def test_comment_whitespace_delimited(self): 8# 1 field, NaN 9 2 3 # skipped line # comment""" - captured_err = StringIO() - orig_stderr, sys.stderr = sys.stderr, captured_err - try: - df = self.read_csv(StringIO(test_input), comment='#', header=None, - delimiter='\\s+', skiprows=0, - error_bad_lines=False) - finally: - sys.stderr = orig_stderr - content = captured_err.getvalue() + df = self.read_csv(StringIO(test_input), comment='#', header=None, + delimiter='\\s+', skiprows=0, + error_bad_lines=False) + error = sys.stderr.getvalue() # skipped lines 2, 3, 4, 9 - assert content.count('Skipping line') == 4, content + for line_num in (2, 3, 4, 9): + assert 'Skipping line {}'.format(line_num) in error, error expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) From 
7b571baadcbfc241edc3a108f8097d9e5b3d81ca Mon Sep 17 00:00:00 2001 From: Pankaj Pandey Date: Wed, 24 May 2017 12:26:47 +0530 Subject: [PATCH 3/5] Add bug fix note in `whatsnew/v0.20.3.txt` --- doc/source/whatsnew/v0.20.2.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 6d6a148ed025f..6149fb0c392c1 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -53,6 +53,7 @@ Indexing I/O ^^^ +- Bug in skipping error lines with inline comments in space delimited text files (:issue:`16472`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) From 44f4f00ff5b81888c4fa2d7502c44b74dea55fc5 Mon Sep 17 00:00:00 2001 From: Pankaj Pandey Date: Wed, 24 May 2017 12:55:44 +0530 Subject: [PATCH 4/5] Move test to CParserTests The behavior is only applicable on the `c` engine. --- pandas/tests/io/parser/c_parser_only.py | 29 +++++++++++++++++++++++++ pandas/tests/io/parser/comment.py | 25 --------------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 3e7a648474bc3..56ac10404b7b2 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -7,6 +7,8 @@ further arguments when parsing. 
""" +import sys + import pytest import numpy as np @@ -417,3 +419,30 @@ def test_data_after_quote(self): expected = DataFrame({'a': ['1', 'ba']}) tm.assert_frame_equal(result, expected) + + @tm.capture_stderr + def test_comment_whitespace_delimited(self): + test_input = """\ +1 2 +2 2 3 +3 2 3 # 3 fields +4 2 3# 3 fields +5 2 # 2 fields +6 2# 2 fields +7 # 1 field, NaN +8# 1 field, NaN +9 2 3 # skipped line +# comment""" + df = self.read_csv(StringIO(test_input), comment='#', header=None, + delimiter='\\s+', skiprows=0, + error_bad_lines=False) + error = sys.stderr.getvalue() + # skipped lines 2, 3, 4, 9 + for line_num in (2, 3, 4, 9): + assert 'Skipping line {}'.format(line_num) in error, error + expected = DataFrame([[1, 2], + [5, 2], + [6, 2], + [7, np.nan], + [8, np.nan]]) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/comment.py b/pandas/tests/io/parser/comment.py index 266b7cef2c9db..9987a017cf985 100644 --- a/pandas/tests/io/parser/comment.py +++ b/pandas/tests/io/parser/comment.py @@ -5,8 +5,6 @@ for all of the parsers defined in parsers.py """ -import sys - import numpy as np import pandas.util.testing as tm @@ -118,26 +116,3 @@ def test_commment_first_line(self): expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']}) result = self.read_csv(StringIO(data), comment='#', header=None) tm.assert_frame_equal(result, expected) - - @tm.capture_stderr - def test_comment_whitespace_delimited(self): - test_input = """\ -1 2 -2 2 3 -3 2 3 # 3 fields -4 2 3# 3 fields -5 2 # 2 fields -6 2# 2 fields -7 # 1 field, NaN -8# 1 field, NaN -9 2 3 # skipped line -# comment""" - df = self.read_csv(StringIO(test_input), comment='#', header=None, - delimiter='\\s+', skiprows=0, - error_bad_lines=False) - error = sys.stderr.getvalue() - # skipped lines 2, 3, 4, 9 - for line_num in (2, 3, 4, 9): - assert 'Skipping line {}'.format(line_num) in error, error - expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) - 
tm.assert_frame_equal(df, expected) From 6fd269de25ab0ab18216a257a7c921ec8350622b Mon Sep 17 00:00:00 2001 From: Pankaj Pandey Date: Wed, 24 May 2017 17:57:02 +0530 Subject: [PATCH 5/5] Update whatsnew bug entry as per review --- doc/source/whatsnew/v0.20.2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 6149fb0c392c1..6c9728191f5b6 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -53,7 +53,7 @@ Indexing I/O ^^^ -- Bug in skipping error lines with inline comments in space delimited text files (:issue:`16472`) +- Bug in pd.read_csv() when comment is passed in space delimited text files (:issue:`16472`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)