BUG: fixes bug when using sep=None and comment keyword for read_csv (#31667)

s-scherrer · web-flow · commit 861df91ae9c0 · 2020-03-02T22:02:22.000-05:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -297,8 +297,10 @@ I/O
 - Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for
   ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
   timestamps with ``version="2.0"`` (:issue:`31652`).
+- Bug in :meth:`read_csv` was raising ``TypeError`` when ``sep=None`` was used in combination with the ``comment`` keyword (:issue:`31396`)
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
 
+
 Plotting
 ^^^^^^^^
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2379,19 +2379,21 @@ class MyDialect(csv.Dialect):
 
             dia = MyDialect
 
-            sniff_sep = True
-
             if sep is not None:
-                sniff_sep = False
                 dia.delimiter = sep
-            # attempt to sniff the delimiter
-            if sniff_sep:
+            else:
+                # attempt to sniff the delimiter from the first valid line,
+                # i.e. a line that is neither a comment nor in skiprows
                 line = f.readline()
-                while self.skipfunc(self.pos):
+                lines = self._check_comments([[line]])[0]
+                while self.skipfunc(self.pos) or not lines:
                     self.pos += 1
                     line = f.readline()
+                    lines = self._check_comments([[line]])[0]
 
-                line = self._check_comments([line])[0]
+                # since `line` is a single string and the loop above only
+                # exits once `lines` is non-empty, it holds a single string
+                line = lines[0]
 
                 self.pos += 1
                 self.line_pos += 1
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -66,6 +66,24 @@ def test_sniff_delimiter(python_parser_only, kwargs):
     tm.assert_frame_equal(result, expected)
 
 
+def test_sniff_delimiter_comment(python_parser_only):
+    data = """# comment line
+index|A|B|C
+# comment line
+foo|1|2|3 # ignore | this
+bar|4|5|6
+baz|7|8|9
+"""
+    parser = python_parser_only
+    result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
+    expected = DataFrame(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=["A", "B", "C"],
+        index=Index(["foo", "bar", "baz"], name="index"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 def test_sniff_delimiter_encoding(python_parser_only, encoding):
     parser = python_parser_only