fix: certain strange characters caused reporting to fail. #1512

nedbat · nedbat · commit 35e249ff74cf · 2022-12-23T10:07:53.000-05:00
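It turns out that str.splitlines() will break text on some characters
(\x0b, \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029) that file.readline()
does not!  Feed the tokenizer from io.StringIO(text).readline instead, so
that source text is split into lines the same way that Python does.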
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -23,9 +23,13 @@ Unreleased
 - File pattern rules were too strict, forbidding plus signs and curly braces in
   directory and file names.  This is now fixed, closing `issue 1513`_.
 
+- Unusual Unicode or control characters in source files could prevent
+  reporting.  This is now fixed, closing `issue 1512`_.
+
 - The PyPy wheel now installs on PyPy 3.7, 3.8, and 3.9, closing `issue 1510`_.
 
 .. _issue 1510: https://github.com/nedbat/coveragepy/issues/1510
+.. _issue 1512: https://github.com/nedbat/coveragepy/issues/1512
 .. _issue 1513: https://github.com/nedbat/coveragepy/issues/1513
 
 
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
@@ -4,6 +4,7 @@
 """Better tokenizing for coverage.py."""
 
 import ast
+import io
 import keyword
 import re
 import token
@@ -172,7 +173,7 @@ def generate_tokens(self, text):
         """A stand-in for `tokenize.generate_tokens`."""
         if text != self.last_text:
             self.last_text = text
-            readline = iter(text.splitlines(True)).__next__
+            readline = io.StringIO(text).readline
             try:
                 self.last_tokens = list(tokenize.generate_tokens(readline))
             except:
diff --git a/tests/test_html.py b/tests/test_html.py
@@ -469,6 +469,38 @@ def test_formfeeds(self):
         formfeed_html = self.get_html_report_content("formfeed.py")
         assert "line_two" in formfeed_html
 
+    def test_splitlines_special_chars(self):
+        # https://github.com/nedbat/coveragepy/issues/1512
+        # See https://docs.python.org/3/library/stdtypes.html#str.splitlines for
+        # the characters splitlines treats specially that readline does not.
+
+        # I'm not exactly sure why we need the "a" strings here, but the old
+        # code wasn't failing without them.
+        self.make_file("splitlines_is_weird.py", """\
+            test = {
+                "0b": ["\x0b0"], "a1": "this is line 2",
+                "0c": ["\x0c0"], "a2": "this is line 3",
+                "1c": ["\x1c0"], "a3": "this is line 4",
+                "1d": ["\x1d0"], "a4": "this is line 5",
+                "1e": ["\x1e0"], "a5": "this is line 6",
+                "85": ["\x850"], "a6": "this is line 7",
+                "2028": ["\u20280"], "a7": "this is line 8",
+                "2029": ["\u20290"], "a8": "this is line 9",
+            }
+            DONE = 1
+            """)
+        cov = coverage.Coverage()
+        self.start_import_stop(cov, "splitlines_is_weird")
+        cov.html_report()
+
+        the_html = self.get_html_report_content("splitlines_is_weird.py")
+        assert "DONE" in the_html
+
+        # Check that the lines are properly decoded and reported...
+        html_lines = the_html.split("\n")
+        assert any(re.search(r'id="t2".*"this is line 2"', line) for line in html_lines)
+        assert any(re.search(r'id="t9".*"this is line 9"', line) for line in html_lines)
+
 
 class HtmlTest(HtmlTestHelpers, CoverageTest):
     """Moar HTML tests."""