Skip to content

Commit 11c9e17

Browse files
authored
Fix issues related to hr tags
Ensure that the start/end tag handler does not include tags in the previous paragraph. Provide special handling for tags like `hr` that never have content. Use sets for the block tag lists, as membership tests are much faster on sets than on lists. Fixes #1053.
1 parent 18b17e1 commit 11c9e17

File tree

5 files changed

+271
-6
lines changed

5 files changed

+271
-6
lines changed

docs/change_log/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Under development: version 3.3.3 (a bug-fix release).
88
* Unify all block-level tags (#1047).
99
* Fix issue where some empty elements would have text rendered as `None` when using `md_in_html` (#1049).
1010
* Avoid catastrophic backtracking in `hr` regex (#1055).
11+
* Fix `hr` HTML handling (#1053).
1112

1213
Oct 19, 2020: version 3.3.2 (a bug-fix release).
1314

markdown/extensions/md_in_html.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,19 @@ class HTMLExtractorExtra(HTMLExtractor):
3030

3131
def __init__(self, md, *args, **kwargs):
3232
# All block-level tags.
33-
self.block_level_tags = md.block_level_elements.copy()
33+
self.block_level_tags = set(md.block_level_elements.copy())
3434
# Block-level tags in which the content only gets span level parsing
35-
self.span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
35+
self.span_tags = set(
36+
['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
37+
)
3638
# Block-level tags which never get their content parsed.
37-
self.raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
39+
self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
3840
# Block-level tags in which the content gets parsed as blocks
39-
self.block_tags = [tag for tag in self.block_level_tags if tag not in self.span_tags + self.raw_tags]
4041
super().__init__(md, *args, **kwargs)
4142

43+
self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
44+
self.span_and_blocks_tags = self.block_tags | self.span_tags
45+
4246
def reset(self):
4347
"""Reset this instance. Loses all unprocessed data."""
4448
self.mdstack = [] # When markdown=1, stack contains a list of tags
@@ -71,10 +75,10 @@ def get_state(self, tag, attrs):
7175
# Only use the parent state if it is more restrictive than the markdown attribute.
7276
md_attr = parent_state
7377
if ((md_attr == '1' and tag in self.block_tags) or
74-
(md_attr == 'block' and tag in self.span_tags + self.block_tags)):
78+
(md_attr == 'block' and tag in self.span_and_blocks_tags)):
7579
return 'block'
7680
elif ((md_attr == '1' and tag in self.span_tags) or
77-
(md_attr == 'span' and tag in self.span_tags + self.block_tags)):
81+
(md_attr == 'span' and tag in self.span_and_blocks_tags)):
7882
return 'span'
7983
elif tag in self.block_level_tags:
8084
return 'off'
@@ -90,6 +94,18 @@ def at_line_start(self):
9094
return value
9195

9296
def handle_starttag(self, tag, attrs):
97+
# Handle tags that should always be empty and do not specify a closing tag
98+
if tag in self.empty_tags:
99+
attrs = {key: value if value is not None else key for key, value in attrs}
100+
if "markdown" in attrs:
101+
attrs.pop('markdown')
102+
element = etree.Element(tag, attrs)
103+
data = etree.tostring(element, encoding='unicode', method='html')
104+
else:
105+
data = self.get_starttag_text()
106+
self.handle_empty_tag(data, True)
107+
return
108+
93109
if tag in self.block_level_tags:
94110
# Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
95111
# Convert to `{'checked': 'checked'}`.
@@ -161,6 +177,19 @@ def handle_endtag(self, tag):
161177
else:
162178
self.handle_data(text)
163179

180+
def handle_startendtag(self, tag, attrs):
181+
if tag in self.empty_tags:
182+
attrs = {key: value if value is not None else key for key, value in attrs}
183+
if "markdown" in attrs:
184+
attrs.pop('markdown')
185+
element = etree.Element(tag, attrs)
186+
data = etree.tostring(element, encoding='unicode', method='html')
187+
else:
188+
data = self.get_starttag_text()
189+
else:
190+
data = self.get_starttag_text()
191+
self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
192+
164193
def handle_data(self, data):
165194
if self.inraw or not self.mdstack:
166195
super().handle_data(data)

markdown/htmlparser.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ class HTMLExtractor(htmlparser.HTMLParser):
5656
def __init__(self, md, *args, **kwargs):
5757
if 'convert_charrefs' not in kwargs:
5858
kwargs['convert_charrefs'] = False
59+
60+
# Block tags that should contain no content (self closing)
61+
self.empty_tags = set(['hr'])
62+
5963
# This calls self.reset
6064
super().__init__(*args, **kwargs)
6165
self.md = md
@@ -120,6 +124,11 @@ def get_endtag_text(self, tag):
120124
return '</{}>'.format(tag)
121125

122126
def handle_starttag(self, tag, attrs):
127+
# Handle tags that should always be empty and do not specify a closing tag
128+
if tag in self.empty_tags:
129+
self.handle_startendtag(tag, attrs)
130+
return
131+
123132
if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
124133
# Started a new raw block. Prepare stack.
125134
self.inraw = True
@@ -183,6 +192,10 @@ def handle_empty_tag(self, data, is_block):
183192
else:
184193
# More content exists after tag.
185194
self.intail = True
195+
item = self.cleandoc[-1] if self.cleandoc else ''
196+
# If we only have one newline before block element, add another
197+
if not item.endswith('\n\n') and item.endswith('\n'):
198+
self.cleandoc.append('\n')
186199
self.cleandoc.append(self.md.htmlStash.store(data))
187200
# Insert blank line between this and next line.
188201
self.cleandoc.append('\n\n')

tests/test_syntax/blocks/test_html_blocks.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,3 +1402,102 @@ def test_inline_script_tags(self):
14021402
"""
14031403
)
14041404
)
1405+
1406+
def test_hr_only_start(self):
1407+
self.assertMarkdownRenders(
1408+
self.dedent(
1409+
"""
1410+
*emphasis1*
1411+
<hr>
1412+
*emphasis2*
1413+
"""
1414+
),
1415+
self.dedent(
1416+
"""
1417+
<p><em>emphasis1</em></p>
1418+
<hr>
1419+
<p><em>emphasis2</em></p>
1420+
"""
1421+
)
1422+
)
1423+
1424+
def test_hr_self_close(self):
1425+
self.assertMarkdownRenders(
1426+
self.dedent(
1427+
"""
1428+
*emphasis1*
1429+
<hr/>
1430+
*emphasis2*
1431+
"""
1432+
),
1433+
self.dedent(
1434+
"""
1435+
<p><em>emphasis1</em></p>
1436+
<hr/>
1437+
<p><em>emphasis2</em></p>
1438+
"""
1439+
)
1440+
)
1441+
1442+
def test_hr_start_and_end(self):
1443+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
1444+
self.assertMarkdownRenders(
1445+
self.dedent(
1446+
"""
1447+
*emphasis1*
1448+
<hr></hr>
1449+
*emphasis2*
1450+
"""
1451+
),
1452+
self.dedent(
1453+
"""
1454+
<p><em>emphasis1</em></p>
1455+
<hr>
1456+
<p></hr>
1457+
<em>emphasis2</em></p>
1458+
"""
1459+
)
1460+
)
1461+
1462+
def test_hr_only_end(self):
1463+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
1464+
self.assertMarkdownRenders(
1465+
self.dedent(
1466+
"""
1467+
*emphasis1*
1468+
</hr>
1469+
*emphasis2*
1470+
"""
1471+
),
1472+
self.dedent(
1473+
"""
1474+
<p><em>emphasis1</em>
1475+
</hr>
1476+
<em>emphasis2</em></p>
1477+
"""
1478+
)
1479+
)
1480+
1481+
def test_hr_with_content(self):
1482+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
1483+
# Content is not allowed and will be treated as normal content between two hr tags.
1484+
self.assertMarkdownRenders(
1485+
self.dedent(
1486+
"""
1487+
*emphasis1*
1488+
<hr>
1489+
**content**
1490+
</hr>
1491+
*emphasis2*
1492+
"""
1493+
),
1494+
self.dedent(
1495+
"""
1496+
<p><em>emphasis1</em></p>
1497+
<hr>
1498+
<p><strong>content</strong>
1499+
</hr>
1500+
<em>emphasis2</em></p>
1501+
"""
1502+
)
1503+
)

tests/test_syntax/extensions/test_md_in_html.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,129 @@ def test_md1_nested_link_ref(self):
893893
)
894894
)
895895

896+
def test_md1_hr_only_start(self):
897+
self.assertMarkdownRenders(
898+
self.dedent(
899+
"""
900+
*emphasis1*
901+
<hr markdown="1">
902+
*emphasis2*
903+
"""
904+
),
905+
self.dedent(
906+
"""
907+
<p><em>emphasis1</em></p>
908+
<hr>
909+
<p><em>emphasis2</em></p>
910+
"""
911+
)
912+
)
913+
914+
def test_md1_hr_self_close(self):
915+
self.assertMarkdownRenders(
916+
self.dedent(
917+
"""
918+
*emphasis1*
919+
<hr markdown="1" />
920+
*emphasis2*
921+
"""
922+
),
923+
self.dedent(
924+
"""
925+
<p><em>emphasis1</em></p>
926+
<hr>
927+
<p><em>emphasis2</em></p>
928+
"""
929+
)
930+
)
931+
932+
def test_md1_hr_start_and_end(self):
933+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
934+
self.assertMarkdownRenders(
935+
self.dedent(
936+
"""
937+
*emphasis1*
938+
<hr markdown="1"></hr>
939+
*emphasis2*
940+
"""
941+
),
942+
self.dedent(
943+
"""
944+
<p><em>emphasis1</em></p>
945+
<hr>
946+
<p></hr>
947+
<em>emphasis2</em></p>
948+
"""
949+
)
950+
)
951+
952+
def test_md1_hr_only_end(self):
953+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
954+
self.assertMarkdownRenders(
955+
self.dedent(
956+
"""
957+
*emphasis1*
958+
</hr>
959+
*emphasis2*
960+
"""
961+
),
962+
self.dedent(
963+
"""
964+
<p><em>emphasis1</em>
965+
</hr>
966+
<em>emphasis2</em></p>
967+
"""
968+
)
969+
)
970+
971+
def test_md1_hr_with_content(self):
972+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
973+
# Content is not allowed and will be treated as normal content between two hr tags
974+
self.assertMarkdownRenders(
975+
self.dedent(
976+
"""
977+
*emphasis1*
978+
<hr markdown="1">
979+
**content**
980+
</hr>
981+
*emphasis2*
982+
"""
983+
),
984+
self.dedent(
985+
"""
986+
<p><em>emphasis1</em></p>
987+
<hr>
988+
<p><strong>content</strong>
989+
</hr>
990+
<em>emphasis2</em></p>
991+
"""
992+
)
993+
)
994+
995+
def test_no_md1_hr_with_content(self):
996+
# Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
997+
# Content is not allowed and will be treated as normal content between two hr tags
998+
self.assertMarkdownRenders(
999+
self.dedent(
1000+
"""
1001+
*emphasis1*
1002+
<hr>
1003+
**content**
1004+
</hr>
1005+
*emphasis2*
1006+
"""
1007+
),
1008+
self.dedent(
1009+
"""
1010+
<p><em>emphasis1</em></p>
1011+
<hr>
1012+
<p><strong>content</strong>
1013+
</hr>
1014+
<em>emphasis2</em></p>
1015+
"""
1016+
)
1017+
)
1018+
8961019
def test_md1_nested_abbr_ref(self):
8971020
self.assertMarkdownRenders(
8981021
self.dedent(

0 commit comments

Comments
 (0)