diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 70b07e08cf760..dda7d6b7736c4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -217,6 +217,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_csv` interpreting an ``NA`` value as a comment when the ``NA`` value contains the comment string; fixed for ``engine="python"`` (:issue:`34002`) - Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`) - Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3244b1c0f65b2..effade6a9dd3e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2983,7 +2983,11 @@ def _check_comments(self, lines): for line in lines: rl = [] for x in line: - if not isinstance(x, str) or self.comment not in x: + if ( + not isinstance(x, str) + or self.comment not in x + or x in self.na_values + ): rl.append(x) else: x = x[: x.find(self.comment)] diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index bddccb0334cc8..d10d8e27a59a5 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -134,3 +134,30 @@ def test_comment_first_line(all_parsers, header): result = parser.read_csv(StringIO(data), comment="#", header=header) tm.assert_frame_equal(result, expected) + + +def test_comment_char_in_default_value(all_parsers, request): + # GH#34002 + if all_parsers.engine == "c": + reason = "see gh-34002: works on the python engine but not the c engine" + # NA value containing comment char is interpreted as comment + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=AssertionError)) + parser = 
all_parsers + + data = ( + "# this is a comment\n" + "col1,col2,col3,col4\n" + "1,2,3,4#inline comment\n" + "4,5#,6,10\n" + "7,8,#N/A,11\n" + ) + result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A") + expected = DataFrame( + { + "col1": [1, 4, 7], + "col2": [2, 5, 8], + "col3": [3.0, np.nan, np.nan], + "col4": [4.0, np.nan, 11.0], + } + ) + tm.assert_frame_equal(result, expected)