Properly parse unclosed tags in code spans

waylan · web-flow · commit 1279074ea978 · 2020-11-23T13:11:21.000-05:00
* Fix unclosed processing instruction in code span. * Fix unclosed declaration in code span. * Fix unclosed tag in code span. Closes #1066.
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
@@ -5,6 +5,7 @@ Python-Markdown Change Log
 
 Under development: version 3.3.4 (a bug-fix release).
 
+* Properly parse unclosed tags in code spans (#1066).
 * Properly parse processing instructions in md_in_html (#1070).
 * Properly parse code spans in md_in_html (#1069).
 
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
@@ -206,6 +206,26 @@ def handle_empty_tag(self, data, is_block):
             else:
                 self.handle_data(self.md.htmlStash.store(data))
 
+    def parse_pi(self, i):
+        if self.at_line_start() or self.intail or self.mdstack:
+            # HTMLExtractor defines this same override but without the mdstack
+            # check, so bypass it and call its parent's implementation instead.
+            return super(HTMLExtractor, self).parse_pi(i)
+        # This is not the beginning of a raw block so treat as plain data
+        # and avoid consuming any tags which may follow (see #1066).
+        self.handle_data('<?')
+        return i + 2
+
+    def parse_html_declaration(self, i):
+        if self.at_line_start() or self.intail or self.mdstack:
+            # HTMLExtractor defines this same override but without the mdstack
+            # check, so bypass it and call its parent's implementation instead.
+            return super(HTMLExtractor, self).parse_html_declaration(i)
+        # This is not the beginning of a raw block so treat as plain data
+        # and avoid consuming any tags which may follow (see #1066).
+        self.handle_data('<!')
+        return i + 2
+
 
 class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval."""
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
@@ -39,6 +39,22 @@
 # so the 'incomplete' functionality is unnecessary. Because the entityref regex is run right before incomplete,
 # and the two regexes are identical, incomplete will simply never match and we avoid the logic within.
 htmlparser.incomplete = htmlparser.entityref
+# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
+htmlparser.locatestarttagend_tolerant = re.compile(r"""
+  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
+  (?:[\s/]*                           # optional whitespace before attribute name
+    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
+      (?:\s*=+\s*                     # value indicator
+        (?:'[^']*'                    # LITA-enclosed value
+          |"[^"]*"                    # LIT-enclosed value
+          |(?!['"])[^`>\s]*           # bare value <= added backtick here
+         )
+         (?:\s*,)*                    # possibly followed by a comma
+       )?(?:\s|/(?!>))*
+     )*
+   )?
+  \s*                                 # trailing whitespace
+""", re.VERBOSE)
 
 # Match a blank line at the start of a block of text (two newlines).
 # The newlines may be preceded by additional whitespace.
@@ -230,6 +246,22 @@ def unknown_decl(self, data):
         end = ']]>' if data.startswith('CDATA[') else ']>'
         self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
 
+    def parse_pi(self, i):
+        if self.at_line_start() or self.intail:
+            return super().parse_pi(i)
+        # This is not the beginning of a raw block so treat as plain data
+        # and avoid consuming any tags which may follow (see #1066).
+        self.handle_data('<?')
+        return i + 2
+
+    def parse_html_declaration(self, i):
+        if self.at_line_start() or self.intail:
+            return super().parse_html_declaration(i)
+        # This is not the beginning of a raw block so treat as plain data
+        # and avoid consuming any tags which may follow (see #1066).
+        self.handle_data('<!')
+        return i + 2
+
     # The rest has been copied from base class in standard lib to address #1036.
     # As __starttag_text is private, all references to it must be in this subclass.
     # The last few lines of parse_starttag are reversed so that handle_starttag
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
@@ -663,6 +663,48 @@ def test_raw_missing_close_bracket(self):
             '<p>&lt;foo</p>'
         )
 
+    def test_raw_unclosed_tag_in_code_span(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                `<div`.
+
+                <div>
+                hello
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <p><code>&lt;div</code>.</p>
+                <div>
+                hello
+                </div>
+                """
+            )
+        )
+
+    def test_raw_unclosed_tag_in_code_span_space(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                ` <div `.
+
+                <div>
+                hello
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <p><code>&lt;div</code>.</p>
+                <div>
+                hello
+                </div>
+                """
+            )
+        )
+
     def test_raw_attributes(self):
         self.assertMarkdownRenders(
             '<p id="foo", class="bar baz", style="margin: 15px; line-height: 1.5; text-align: center;">text</p>',
@@ -1073,6 +1115,27 @@ def test_raw_processing_instruction_indented(self):
             )
         )
 
+    def test_raw_processing_instruction_code_span(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                `<?php`
+
+                <div>
+                foo
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <p><code>&lt;?php</code></p>
+                <div>
+                foo
+                </div>
+                """
+            )
+        )
+
     def test_raw_declaration_one_line(self):
         self.assertMarkdownRenders(
             '<!DOCTYPE html>',
@@ -1110,6 +1173,27 @@ def test_raw_multiline_declaration(self):
             )
         )
 
+    def test_raw_declaration_code_span(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                `<!`
+
+                <div>
+                foo
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <p><code>&lt;!</code></p>
+                <div>
+                foo
+                </div>
+                """
+            )
+        )
+
     def test_raw_cdata_one_line(self):
         self.assertMarkdownRenders(
             '<![CDATA[ document.write(">"); ]]>',
@@ -1190,6 +1274,27 @@ def test_raw_cdata_indented(self):
             )
         )
 
+    def test_raw_cdata_code_span(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                `<![`
+
+                <div>
+                foo
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <p><code>&lt;![</code></p>
+                <div>
+                foo
+                </div>
+                """
+            )
+        )
+
     def test_charref(self):
         self.assertMarkdownRenders(
             '&sect;',