Skip to content

Commit 1279074

Browse files
authored
Properly parse unclosed tags in code spans
* Fix unclosed processing instruction (pi) in code span
* Fix unclosed declaration in code span
* Fix unclosed tag in code span

Closes #1066.
1 parent 82ac905 commit 1279074

File tree

4 files changed

+158
-0
lines changed

4 files changed

+158
-0
lines changed

docs/change_log/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Python-Markdown Change Log
55

66
Under development: version 3.3.4 (a bug-fix release).
77

8+
* Properly parse unclosed tags in code spans (#1066).
89
* Properly parse processing instructions in md_in_html (#1070).
910
* Properly parse code spans in md_in_html (#1069).
1011

markdown/extensions/md_in_html.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,26 @@ def handle_empty_tag(self, data, is_block):
206206
else:
207207
self.handle_data(self.md.htmlStash.store(data))
208208

209+
def parse_pi(self, i):
210+
if self.at_line_start() or self.intail or self.mdstack:
211+
# The same override exists in HTMLExtractor without the check
212+
# for mdstack. Therefore, use HTMLExtractor's parent instead.
213+
return super(HTMLExtractor, self).parse_pi(i)
214+
# This is not the beginning of a raw block so treat as plain data
215+
# and avoid consuming any tags which may follow (see #1066).
216+
self.handle_data('<?')
217+
return i + 2
218+
219+
def parse_html_declaration(self, i):
220+
if self.at_line_start() or self.intail or self.mdstack:
221+
# The same override exists in HTMLExtractor without the check
222+
# for mdstack. Therefore, use HTMLExtractor's parent instead.
223+
return super(HTMLExtractor, self).parse_html_declaration(i)
224+
# This is not the beginning of a raw block so treat as plain data
225+
# and avoid consuming any tags which may follow (see #1066).
226+
self.handle_data('<!')
227+
return i + 2
228+
209229

210230
class HtmlBlockPreprocessor(Preprocessor):
211231
"""Remove html blocks from the text and store them for later retrieval."""

markdown/htmlparser.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,22 @@
3939
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
4040
# and the two regexes are identical, incomplete will simply never match and we avoid the logic within.
4141
htmlparser.incomplete = htmlparser.entityref
42+
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
43+
htmlparser.locatestarttagend_tolerant = re.compile(r"""
44+
<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
45+
(?:[\s/]* # optional whitespace before attribute name
46+
(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
47+
(?:\s*=+\s* # value indicator
48+
(?:'[^']*' # LITA-enclosed value
49+
|"[^"]*" # LIT-enclosed value
50+
|(?!['"])[^`>\s]* # bare value <= added backtick here
51+
)
52+
(?:\s*,)* # possibly followed by a comma
53+
)?(?:\s|/(?!>))*
54+
)*
55+
)?
56+
\s* # trailing whitespace
57+
""", re.VERBOSE)
4258

4359
# Match a blank line at the start of a block of text (two newlines).
4460
# The newlines may be preceded by additional whitespace.
@@ -230,6 +246,22 @@ def unknown_decl(self, data):
230246
end = ']]>' if data.startswith('CDATA[') else ']>'
231247
self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
232248

249+
def parse_pi(self, i):
250+
if self.at_line_start() or self.intail:
251+
return super().parse_pi(i)
252+
# This is not the beginning of a raw block so treat as plain data
253+
# and avoid consuming any tags which may follow (see #1066).
254+
self.handle_data('<?')
255+
return i + 2
256+
257+
def parse_html_declaration(self, i):
258+
if self.at_line_start() or self.intail:
259+
return super().parse_html_declaration(i)
260+
# This is not the beginning of a raw block so treat as plain data
261+
# and avoid consuming any tags which may follow (see #1066).
262+
self.handle_data('<!')
263+
return i + 2
264+
233265
# The rest has been copied from base class in standard lib to address #1036.
234266
# As __starttag_text is private, all references to it must be in this subclass.
235267
# The last few lines of parse_starttag are reversed so that handle_starttag

tests/test_syntax/blocks/test_html_blocks.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,48 @@ def test_raw_missing_close_bracket(self):
663663
'<p>&lt;foo</p>'
664664
)
665665

666+
def test_raw_unclosed_tag_in_code_span(self):
667+
self.assertMarkdownRenders(
668+
self.dedent(
669+
"""
670+
`<div`.
671+
672+
<div>
673+
hello
674+
</div>
675+
"""
676+
),
677+
self.dedent(
678+
"""
679+
<p><code>&lt;div</code>.</p>
680+
<div>
681+
hello
682+
</div>
683+
"""
684+
)
685+
)
686+
687+
def test_raw_unclosed_tag_in_code_span_space(self):
688+
self.assertMarkdownRenders(
689+
self.dedent(
690+
"""
691+
` <div `.
692+
693+
<div>
694+
hello
695+
</div>
696+
"""
697+
),
698+
self.dedent(
699+
"""
700+
<p><code>&lt;div</code>.</p>
701+
<div>
702+
hello
703+
</div>
704+
"""
705+
)
706+
)
707+
666708
def test_raw_attributes(self):
667709
self.assertMarkdownRenders(
668710
'<p id="foo", class="bar baz", style="margin: 15px; line-height: 1.5; text-align: center;">text</p>',
@@ -1073,6 +1115,27 @@ def test_raw_processing_instruction_indented(self):
10731115
)
10741116
)
10751117

1118+
def test_raw_processing_instruction_code_span(self):
1119+
self.assertMarkdownRenders(
1120+
self.dedent(
1121+
"""
1122+
`<?php`
1123+
1124+
<div>
1125+
foo
1126+
</div>
1127+
"""
1128+
),
1129+
self.dedent(
1130+
"""
1131+
<p><code>&lt;?php</code></p>
1132+
<div>
1133+
foo
1134+
</div>
1135+
"""
1136+
)
1137+
)
1138+
10761139
def test_raw_declaration_one_line(self):
10771140
self.assertMarkdownRenders(
10781141
'<!DOCTYPE html>',
@@ -1110,6 +1173,27 @@ def test_raw_multiline_declaration(self):
11101173
)
11111174
)
11121175

1176+
def test_raw_declaration_code_span(self):
1177+
self.assertMarkdownRenders(
1178+
self.dedent(
1179+
"""
1180+
`<!`
1181+
1182+
<div>
1183+
foo
1184+
</div>
1185+
"""
1186+
),
1187+
self.dedent(
1188+
"""
1189+
<p><code>&lt;!</code></p>
1190+
<div>
1191+
foo
1192+
</div>
1193+
"""
1194+
)
1195+
)
1196+
11131197
def test_raw_cdata_one_line(self):
11141198
self.assertMarkdownRenders(
11151199
'<![CDATA[ document.write(">"); ]]>',
@@ -1190,6 +1274,27 @@ def test_raw_cdata_indented(self):
11901274
)
11911275
)
11921276

1277+
def test_raw_cdata_code_span(self):
1278+
self.assertMarkdownRenders(
1279+
self.dedent(
1280+
"""
1281+
`<![`
1282+
1283+
<div>
1284+
foo
1285+
</div>
1286+
"""
1287+
),
1288+
self.dedent(
1289+
"""
1290+
<p><code>&lt;![</code></p>
1291+
<div>
1292+
foo
1293+
</div>
1294+
"""
1295+
)
1296+
)
1297+
11931298
def test_charref(self):
11941299
self.assertMarkdownRenders(
11951300
'&sect;',

0 commit comments

Comments
 (0)