Fix issues related to hr tags

facelessuser · web-flow · commit 11c9e179390b · 2020-10-24T21:34:51.000-04:00
Ensure that the start/end tag handler does not include tags in the previous paragraph. Provide special handling for tags like hr that never have content. Use sets for the block tag lists, as membership tests on sets are much faster than on lists. Fixes #1053.
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
@@ -8,6 +8,7 @@ Under development: version 3.3.3 (a bug-fix release).
 * Unify all block-level tags (#1047).
 * Fix issue where some empty elements would have text rendered as `None` when using `md_in_html` (#1049).
 * Avoid catastrophic backtracking in `hr` regex (#1055).
+* Fix `hr` HTML handling (#1053).
 
 Oct 19, 2020: version 3.3.2 (a bug-fix release).
 
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
@@ -30,15 +30,19 @@ class HTMLExtractorExtra(HTMLExtractor):
 
     def __init__(self, md, *args, **kwargs):
         # All block-level tags.
-        self.block_level_tags = md.block_level_elements.copy()
+        self.block_level_tags = set(md.block_level_elements.copy())
         # Block-level tags in which the content only gets span level parsing
-        self.span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+        self.span_tags = set(
+            ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+        )
         # Block-level tags which never get their content parsed.
-        self.raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
+        self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
         # Block-level tags in which the content gets parsed as blocks
-        self.block_tags = [tag for tag in self.block_level_tags if tag not in self.span_tags + self.raw_tags]
         super().__init__(md, *args, **kwargs)
 
+        self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
+        self.span_and_blocks_tags = self.block_tags | self.span_tags
+
     def reset(self):
         """Reset this instance.  Loses all unprocessed data."""
         self.mdstack = []  # When markdown=1, stack contains a list of tags
@@ -71,10 +75,10 @@ def get_state(self, tag, attrs):
             # Only use the parent state if it is more restrictive than the markdown attribute.
             md_attr = parent_state
         if ((md_attr == '1' and tag in self.block_tags) or
-                (md_attr == 'block' and tag in self.span_tags + self.block_tags)):
+                (md_attr == 'block' and tag in self.span_and_blocks_tags)):
             return 'block'
         elif ((md_attr == '1' and tag in self.span_tags) or
-              (md_attr == 'span' and tag in self.span_tags + self.block_tags)):
+              (md_attr == 'span' and tag in self.span_and_blocks_tags)):
             return 'span'
         elif tag in self.block_level_tags:
             return 'off'
@@ -90,6 +94,18 @@ def at_line_start(self):
         return value
 
     def handle_starttag(self, tag, attrs):
+        # Handle tags that should always be empty and do not specify a closing tag
+        if tag in self.empty_tags:
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            if "markdown" in attrs:
+                attrs.pop('markdown')
+                element = etree.Element(tag, attrs)
+                data = etree.tostring(element, encoding='unicode', method='html')
+            else:
+                data = self.get_starttag_text()
+            self.handle_empty_tag(data, True)
+            return
+
         if tag in self.block_level_tags:
             # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
             # Convert to `{'checked': 'checked'}`.
@@ -161,6 +177,19 @@ def handle_endtag(self, tag):
                 else:
                     self.handle_data(text)
 
+    def handle_startendtag(self, tag, attrs):
+        if tag in self.empty_tags:
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            if "markdown" in attrs:
+                attrs.pop('markdown')
+                element = etree.Element(tag, attrs)
+                data = etree.tostring(element, encoding='unicode', method='html')
+            else:
+                data = self.get_starttag_text()
+        else:
+            data = self.get_starttag_text()
+        self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
+
     def handle_data(self, data):
         if self.inraw or not self.mdstack:
             super().handle_data(data)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
@@ -56,6 +56,10 @@ class HTMLExtractor(htmlparser.HTMLParser):
     def __init__(self, md, *args, **kwargs):
         if 'convert_charrefs' not in kwargs:
             kwargs['convert_charrefs'] = False
+
+        # Block tags that should contain no content (self closing)
+        self.empty_tags = set(['hr'])
+
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md
@@ -120,6 +124,11 @@ def get_endtag_text(self, tag):
             return '</{}>'.format(tag)
 
     def handle_starttag(self, tag, attrs):
+        # Handle tags that should always be empty and do not specify a closing tag
+        if tag in self.empty_tags:
+            self.handle_startendtag(tag, attrs)
+            return
+
         if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
             # Started a new raw block. Prepare stack.
             self.inraw = True
@@ -183,6 +192,10 @@ def handle_empty_tag(self, data, is_block):
             else:
                 # More content exists after tag.
                 self.intail = True
+            item = self.cleandoc[-1] if self.cleandoc else ''
+            # If we only have one newline before block element, add another
+            if not item.endswith('\n\n') and item.endswith('\n'):
+                self.cleandoc.append('\n')
             self.cleandoc.append(self.md.htmlStash.store(data))
             # Insert blank line between this and next line.
             self.cleandoc.append('\n\n')
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
@@ -1402,3 +1402,102 @@ def test_inline_script_tags(self):
                 """
             )
         )
+
+    def test_hr_only_start(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_self_close(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr/>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr/>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_start_and_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr></hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p></hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_only_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_with_content(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        # Content is not allowed and will be treated as normal content between two hr tags.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr>
+                **content**
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><strong>content</strong>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py
@@ -893,6 +893,129 @@ def test_md1_nested_link_ref(self):
             )
         )
 
+    def test_md1_hr_only_start(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1">
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_self_close(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1" />
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_start_and_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1"></hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p></hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_only_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_with_content(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        # Content is not allowed and will be treated as normal content between two hr tags
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1">
+                **content**
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><strong>content</strong>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_no_md1_hr_with_content(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything special to handle them.
+        # Content is not allowed and will be treated as normal content between two hr tags
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr>
+                **content**
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><strong>content</strong>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
     def test_md1_nested_abbr_ref(self):
         self.assertMarkdownRenders(
             self.dedent(