Skip to content

Commit 241b4b8

Browse files
committed
Modernize handling of <!--...--> in CDATA
"Escaping text spans" were part of early drafts of the HTML5 spec but were removed in favor of special handling in CSS (see [CDO/CDC][1]) and [EcmaScript][2] grammar. This change * makes parsing consistent with recent versions of HTML 5 * updates rendering to be consistent with documented [script content restrictions][3] The script content restrictions are also applied to other CDATA tags like `<style>`. See also [issue #153][] [1]: https://www.w3.org/TR/CSS2/grammar.html#scanner [2]: http://www.ecma-international.org/ecma-262/6.0/#sec-html-like-comments [3]: https://www.w3.org/TR/html5/scripting-1.html#restrictions-for-contents-of-script-elements [issue #153]: #153
1 parent bb43971 commit 241b4b8

File tree

6 files changed

+250
-144
lines changed

6 files changed

+250
-144
lines changed

src/main/java/org/owasp/html/HtmlLexer.java

Lines changed: 0 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -348,36 +348,6 @@ private static enum State {
348348
BOGUS_COMMENT,
349349
SERVER_CODE,
350350
SERVER_CODE_PCT,
351-
352-
// From HTML 5 section 8.1.2.6
353-
354-
// The text in CDATA and RCDATA elements must not contain any
355-
// occurrences of the string "</" followed by characters that
356-
// case-insensitively match the tag name of the element followed
357-
// by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
358-
// U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
359-
// U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
360-
// that string is part of an escaping text span.
361-
362-
// An escaping text span is a span of text (in CDATA and RCDATA
363-
// elements) and character entity references (in RCDATA elements)
364-
// that starts with an escaping text span start that is not itself
365-
// in an escaping text span, and ends at the next escaping text
366-
// span end.
367-
368-
// An escaping text span start is a part of text that consists of
369-
// the four character sequence "<!--".
370-
371-
// An escaping text span end is a part of text that consists of
372-
// the three character sequence "-->".
373-
374-
// An escaping text span start may share its U+002D HYPHEN-MINUS characters
375-
// with its corresponding escaping text span end.
376-
UNESCAPED_LT_BANG, // <!
377-
UNESCAPED_LT_BANG_DASH, // <!-
378-
ESCAPING_TEXT_SPAN, // Inside an escaping text span
379-
ESCAPING_TEXT_SPAN_DASH, // Seen - inside an escaping text span
380-
ESCAPING_TEXT_SPAN_DASH_DASH, // Seen -- inside an escaping text span
381351
;
382352
}
383353

@@ -471,16 +441,6 @@ private HtmlToken parseToken() {
471441
case '!': // Comment or declaration
472442
if (!this.inEscapeExemptBlock) {
473443
state = State.BANG;
474-
} else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
475-
escapeExemptTagName)) {
476-
// Directives, and cdata suppressed in escape
477-
// exempt mode as they could obscure the close of the
478-
// escape exempty block, but comments are similar to escaping
479-
// text spans, and are significant in all CDATA and RCDATA
480-
// blocks except those inside <xmp> tags.
481-
// See "Escaping text spans" in section 8.1.2.6 of HTML5.
482-
// http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
483-
state = State.UNESCAPED_LT_BANG;
484444
}
485445
++end;
486446
break;
@@ -603,47 +563,6 @@ && canonicalName(start + 2, end)
603563
state = State.SERVER_CODE;
604564
}
605565
break;
606-
case UNESCAPED_LT_BANG:
607-
if ('-' == ch) {
608-
state = State.UNESCAPED_LT_BANG_DASH;
609-
} else {
610-
type = HtmlTokenType.TEXT;
611-
state = State.DONE;
612-
}
613-
break;
614-
case UNESCAPED_LT_BANG_DASH:
615-
if ('-' == ch) {
616-
// According to HTML 5 section 8.1.2.6
617-
618-
// An escaping text span start may share its
619-
// U+002D HYPHEN-MINUS characters with its
620-
// corresponding escaping text span end.
621-
state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
622-
} else {
623-
type = HtmlTokenType.TEXT;
624-
state = State.DONE;
625-
}
626-
break;
627-
case ESCAPING_TEXT_SPAN:
628-
if ('-' == ch) {
629-
state = State.ESCAPING_TEXT_SPAN_DASH;
630-
}
631-
break;
632-
case ESCAPING_TEXT_SPAN_DASH:
633-
if ('-' == ch) {
634-
state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
635-
} else {
636-
state = State.ESCAPING_TEXT_SPAN;
637-
}
638-
break;
639-
case ESCAPING_TEXT_SPAN_DASH_DASH:
640-
if ('>' == ch) {
641-
type = HtmlTokenType.TEXT;
642-
state = State.DONE;
643-
} else if ('-' != ch) {
644-
state = State.ESCAPING_TEXT_SPAN;
645-
}
646-
break;
647566
case DONE:
648567
throw new AssertionError(
649568
"Unexpectedly DONE while lexing HTML token stream");

src/main/java/org/owasp/html/HtmlStreamRenderer.java

Lines changed: 55 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -288,67 +288,65 @@ private final void writeText(String text) throws IOException {
288288

289289
private static int checkHtmlCdataCloseable(
290290
String localName, StringBuilder sb) {
291-
int escapingTextSpanStart = -1;
291+
// www.w3.org/TR/html51/semantics-scripting.html#restrictions-for-contents-of-script-elements
292+
// www.w3.org/TR/html5/scripting-1.html#restrictions-for-contents-of-script-elements
293+
// 4.12.1.3. Restrictions for contents of script elements
294+
// The textContent of a script element must match the script production in the following ABNF, the character set for which is Unicode. [ABNF]
295+
//
296+
// script = outer *( comment-open inner comment-close outer )
297+
//
298+
// outer = < any string that doesn’t contain a substring that matches not-in-outer >
299+
// not-in-outer = comment-open
300+
// inner = < any string that doesn’t contain a substring that matches not-in-inner >
301+
// not-in-inner = comment-close / script-open
302+
//
303+
// comment-open = "<!--"
304+
// comment-close = "-->"
305+
// script-open = "<" s c r i p t tag-end
306+
307+
// We apply the above restrictions to all CDATA (modulo local name).
308+
int innerStart = -1;
292309
for (int i = 0, n = sb.length(); i < n; ++i) {
293310
char ch = sb.charAt(i);
294311
switch (ch) {
295312
case '<':
296-
if (i + 3 < n
297-
&& '!' == sb.charAt(i + 1)
298-
&& '-' == sb.charAt(i + 2)
299-
&& '-' == sb.charAt(i + 3)) {
300-
if (escapingTextSpanStart == -1) {
301-
escapingTextSpanStart = i;
302-
} else {
303-
return i;
313+
if (i + 3 < n && sb.charAt(i + 1) == '!') {
314+
if (sb.charAt(i + 2) == '-'
315+
&& sb.charAt(i + 3) == '-') {
316+
if (innerStart >= 0) { return i; } // Nesting
317+
innerStart = i;
304318
}
305-
} else if (i + 1 + localName.length() < n
306-
&& '/' == sb.charAt(i + 1)
307-
&& Strings.regionMatchesIgnoreCase(
308-
sb, i + 2, localName, 0, localName.length())) {
309-
// A close tag contained in the content.
310-
if (escapingTextSpanStart < 0) {
311-
// We could try some recovery strategies here.
312-
// E.g. prepending "/<!--\n" to sb if "script".equals(localName)
313-
return i;
319+
} else { // Look for embedded <script or </script
320+
int start = i + 1;
321+
if (start + 1 < n && sb.charAt(start) == '/') {
322+
++start;
323+
} else if (innerStart < 0) {
324+
break;
314325
}
315-
if (!"script".equals(localName)) {
316-
// Script tags are commonly included inside script tags.
317-
// <script><!--document.write('<script>f()</script>');--></script>
318-
// but this does not happen in other CDATA element types.
319-
// Actually allowing an end tag inside others is problematic.
320-
// Specifically,
321-
// <style><!--</style>-->/* foo */</style>
322-
// displays the text "/* foo */" on some browsers.
326+
// We don't need to do any suffix checks to preserve concatenation safety
327+
// since we buffer pending unescaped above.
328+
int end = start + localName.length();
329+
if (end <= n
330+
&& Strings.regionMatchesIgnoreCase(
331+
sb, start, localName, 0, end - start)
332+
&& (end == n || isTagEnd(sb.charAt(end)))) {
323333
return i;
324334
}
325335
}
326336
break;
327337
case '>':
328-
// From the HTML5 spec:
329-
// The text in style, script, title, and textarea elements must not
330-
// have an escaping text span start that is not followed by an
331-
// escaping text span end.
332-
// We look left since the HTML 5 spec allows the escaping text span
333-
// end to share dashes with the start.
334-
if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) {
335-
if (escapingTextSpanStart < 0) { return i - 2; }
336-
escapingTextSpanStart = -1;
338+
if (i >= 2 && sb.charAt(i - 2) == '-' && sb.charAt(i - 2) == '-') {
339+
if (innerStart < 0) { return i; }
340+
// Merged start and end like <!--->
341+
if (innerStart + 6 >= i) { return i; }
342+
innerStart = -1;
337343
}
338344
break;
339-
default:
340-
break;
341345
}
342346
}
343-
if (escapingTextSpanStart >= 0) {
344-
// We could try recovery strategies here.
345-
// E.g. appending "//-->" to the buffer if "script".equals(localName)
346-
return escapingTextSpanStart;
347-
}
348-
return -1;
347+
return innerStart;
349348
}
350349

351-
352350
@VisibleForTesting
353351
static boolean isValidHtmlName(String name) {
354352
int n = name.length();
@@ -421,4 +419,17 @@ public void close() throws IOException {
421419
closeable.close();
422420
}
423421
}
422+
423+
private static final long TAG_ENDS = 0L
424+
| (1L << '\t')
425+
| (1L << '\n')
426+
| (1L << '\f')
427+
| (1L << '\r')
428+
| (1L << ' ')
429+
| (1L << '/')
430+
| (1L << '>');
431+
432+
private static boolean isTagEnd(char ch) {
433+
return ch < 63 && 0 != (TAG_ENDS & (1L << ch));
434+
}
424435
}

src/main/java/org/owasp/html/HtmlTextEscapingMode.java

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -143,18 +143,6 @@ public static HtmlTextEscapingMode getModeForTag(String canonTagName) {
143143
return mode != null ? mode : PCDATA;
144144
}
145145

146-
/**
147-
* True iff the content following the given tag allows escaping text
148-
* spans: {@code <!--&hellip;-->} that escape even things that might
149-
* be an end tag for the corresponding open tag.
150-
*/
151-
public static boolean allowsEscapingTextSpan(String canonTagName) {
152-
// <xmp> and <plaintext> do not admit escaping text spans.
153-
return "style".equals(canonTagName) || "script".equals(canonTagName)
154-
|| "noembed".equals(canonTagName) || "noscript".equals(canonTagName)
155-
|| "noframes".equals(canonTagName);
156-
}
157-
158146
/**
159147
* True if content immediately following the start tag must be treated as
160148
* special CDATA so that &lt;'s are not treated as starting tags, comments

src/test/java/org/owasp/html/HtmlPolicyBuilderTest.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.junit.Test;
3535

3636
import com.google.common.base.Joiner;
37+
import com.google.common.collect.ImmutableSet;
3738

3839
import junit.framework.TestCase;
3940

@@ -175,6 +176,18 @@ public static final void testLinksWithNofollow() {
175176
.requireRelNofollowOnLinks()));
176177
}
177178

179+
@Test
180+
public static final void testLinksWithNofollowAlreadyPresent() {
181+
assertEquals(
182+
"html <a href=\"/\" rel=\"nofollow\">link</a>",
183+
apply(
184+
new HtmlPolicyBuilder()
185+
.allowElements("a")
186+
.allowAttributes("href").onElements("a")
187+
.requireRelNofollowOnLinks(),
188+
"html <a href='/' rel='nofollow'>link</a>"));
189+
}
190+
178191
@Test
179192
public static final void testImagesAllowed() {
180193
assertEquals(
@@ -763,6 +776,47 @@ public static final void testDirLi() {
763776
"<dir compact=\"compact\"><li>something</li></dir>"));
764777
}
765778

779+
@Test
780+
public static void testScriptTagWithCommentBlockContainingHtmlCommentEnd() {
781+
PolicyFactory scriptSanitizer = new HtmlPolicyBuilder()
782+
// allow scripts of type application/json
783+
.allowElements(
784+
new ElementPolicy() {
785+
public String apply(String elementName, List<String> attrs) {
786+
int typeIndex = attrs.indexOf("type");
787+
if (typeIndex < 0 || attrs.size() < typeIndex + 1
788+
|| !attrs.get(typeIndex + 1).equals("application/json")) {
789+
return null;
790+
}
791+
return elementName;
792+
}
793+
},
794+
"script")
795+
// allow contents in this script tag
796+
.allowTextIn("script")
797+
// keep type attribute in application/json script tag
798+
.allowAttributes("type").matching(true, ImmutableSet.of("application/json")).onElements("script")
799+
.toFactory();
800+
801+
String mismatchedHtmlComments = "<script type=\"application/json\">\n" +
802+
"<!--\n" +
803+
"{\"field\":\"-->\"}\n" +
804+
"// -->\n" +
805+
"</script>";
806+
assertEquals(
807+
"<script type=\"application/json\"></script>",
808+
scriptSanitizer.sanitize(mismatchedHtmlComments));
809+
810+
String htmlMetaCharsEscaped = "<script type=\"application/json\">\n" +
811+
"<!--\n" +
812+
"{\"field\":\"--\\u003c\"}\n" +
813+
"// -->\n" +
814+
"</script>";
815+
assertEquals(
816+
htmlMetaCharsEscaped,
817+
scriptSanitizer.sanitize(htmlMetaCharsEscaped));
818+
}
819+
766820
private static String apply(HtmlPolicyBuilder b) {
767821
return apply(b, EXAMPLE);
768822
}

0 commit comments

Comments
 (0)