diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 4a2a601f..20dbed99 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -41,8 +41,21 @@ public final class Encoding { * * @param s text/html * @return text/plain + * @deprecated specify whether s is in an attribute value */ public static String decodeHtml(String s) { + return decodeHtml(s, false); + } + + /** + * Decodes HTML entities to produce a string containing only valid + * Unicode scalar values. + * + * @param s text/html + * @param inAttribute is s in an attribute value? + * @return text/plain + */ + public static String decodeHtml(String s, boolean inAttribute) { int firstAmp = s.indexOf('&'); int safeLimit = longestPrefixOfGoodCodeunits(s); if ((firstAmp & safeLimit) < 0) { return s; } @@ -55,7 +68,7 @@ public static String decodeHtml(String s) { int amp = firstAmp; while (amp >= 0) { sb.append(s, pos, amp); - int end = HtmlEntities.appendDecodedEntity(s, amp, n, sb); + int end = HtmlEntities.appendDecodedEntity(s, amp, n, inAttribute, sb); pos = end; amp = s.indexOf('&', end); } diff --git a/src/main/java/org/owasp/html/HtmlEntities.java b/src/main/java/org/owasp/html/HtmlEntities.java index fc015776..774108cf 100644 --- a/src/main/java/org/owasp/html/HtmlEntities.java +++ b/src/main/java/org/owasp/html/HtmlEntities.java @@ -2307,9 +2307,26 @@ final class HtmlEntities { * in {@code html}. * @param sb string builder to append to. * @return The offset after the end of the decoded sequence in {@code html}. + * @deprecated specify whether html is in an attribute value. */ public static int appendDecodedEntity( - String html, int offset, int limit, StringBuilder sb) { + String html, int offset, int limit, StringBuilder sb) { + return appendDecodedEntity(html, offset, limit, false, sb); + } + + /** + * Decodes any HTML entity at the given location and appends it to a string + * builder. This handles both named and numeric entities. + * + * @param html HTML text. + * @param offset the position of the sequence to decode in {@code html}. + * @param limit the last position that could be part of the sequence to decode + * in {@code html}. + * @param sb string builder to append to. + * @return The offset after the end of the decoded sequence in {@code html}. + */ + public static int appendDecodedEntity( + String html, int offset, int limit, boolean inAttribute, StringBuilder sb) { char ch = html.charAt(offset); if ('&' != ch) { sb.append(ch); @@ -2422,11 +2439,12 @@ public static int appendDecodedEntity( char nameChar = html.charAt(i); t = t.lookup(nameChar); if (t == null) { break; } - if (t.isTerminal()) { + if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) { longestDecode = t; tail = i + 1; } } + // Try again, case insensitively. if (longestDecode == null) { t = ENTITY_TRIE; for (int i = offset + 1; i < limit; ++i) { @@ -2434,7 +2452,7 @@ public static int appendDecodedEntity( if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; } t = t.lookup(nameChar); if (t == null) { break; } - if (t.isTerminal()) { + if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) { longestDecode = t; tail = i + 1; } @@ -2456,11 +2474,35 @@ public static int appendDecodedEntity( private static boolean isHtmlIdContinueChar(char ch) { int chLower = ch | 32; - return ('0' <= chLower && chLower <= '9') + return ('0' <= ch && ch <= '9') || ('a' <= chLower && chLower <= 'z') || ('-' == ch); } + /** True if the character at i in html may complete a named character reference */ + private static boolean mayComplete(boolean inAttribute, String html, int i, int limit) { + if (inAttribute && html.charAt(i) != ';' && i + 1 < limit) { + // See if the next character blocks treating this as a full match. + // This avoids problems like "¶" being treated as a decoding in + // + if (continuesCharacterReferenceName(html.charAt(i + 1))) { + return false; + } + } + return true; + } + + /** + * @see comments in issue 254 + */ + private static boolean continuesCharacterReferenceName(char ch) { + int chLower = ch | 32; + return ('0' <= ch && ch <= '9') + || ('a' <= chLower && chLower <= 'z') + || (ch == '='); + } + // /** A possible entity name like "amp" or "gt". */ // public static boolean isEntityName(String name) { // Trie t = ENTITY_TRIE; diff --git a/src/main/java/org/owasp/html/HtmlSanitizer.java b/src/main/java/org/owasp/html/HtmlSanitizer.java index 63d7ae95..d5f8750c 100644 --- a/src/main/java/org/owasp/html/HtmlSanitizer.java +++ b/src/main/java/org/owasp/html/HtmlSanitizer.java @@ -144,7 +144,7 @@ public static void sanitize( switch (token.type) { case TEXT: receiver.text( - Encoding.decodeHtml(htmlContent.substring(token.start, token.end))); + Encoding.decodeHtml(htmlContent.substring(token.start, token.end), false)); break; case UNESCAPED: receiver.text(Encoding.stripBannedCodeunits( @@ -177,8 +177,9 @@ public static void sanitize( htmlContent.substring(tagBodyToken.start, tagBodyToken.end))); break; case ATTRVALUE: - attrs.add(Encoding.decodeHtml(stripQuotes( - htmlContent.substring(tagBodyToken.start, tagBodyToken.end)))); + String attributeContentRaw = + stripQuotes(htmlContent.substring(tagBodyToken.start, tagBodyToken.end)); + attrs.add(Encoding.decodeHtml(attributeContentRaw, true)); attrsReadyForName = true; break; case TAGEND: diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java index eea7769a..54ff0b45 100644 --- a/src/test/java/org/owasp/html/EncodingTest.java +++ b/src/test/java/org/owasp/html/EncodingTest.java @@ -34,6 +34,24 @@ @SuppressWarnings("javadoc") public final class EncodingTest extends TestCase { + private static void assertDecodedHtml(String want, String inputHtml) { + assertDecodedHtml(want, want, inputHtml); + } + + private static void assertDecodedHtml( + String wantText, String wantAttr, String inputHtml + ) { + assertEquals( + "!inAttribute: " + inputHtml, + wantText, + Encoding.decodeHtml(inputHtml, false) + ); + assertEquals( + "inAttribute: " + inputHtml, + wantAttr, + Encoding.decodeHtml(inputHtml, true) + ); + } @Test public static final void testDecodeHtml() { @@ -43,170 +61,90 @@ public static final void testDecodeHtml() { // 123456789012345678901234567890123456789012345678901234567890123456789 String golden = "The quick\u00a0brown fox\njumps over\r\nthe lazy dog\n"; - assertEquals(golden, Encoding.decodeHtml(html)); + assertDecodedHtml(golden, html); // Don't allocate a new string when no entities. - assertSame(golden, Encoding.decodeHtml(golden)); + assertSame(golden, golden); // test interrupted escapes and escapes at end of file handled gracefully - assertEquals( - "\\\\u000a", - Encoding.decodeHtml("\\\\u000a")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( + assertDecodedHtml("\\\\u000a", "\\\\u000a"); + assertDecodedHtml("\n", " "); + assertDecodedHtml("\n", " "); + assertDecodedHtml("\n", " "); + assertDecodedHtml("\n", " "); + assertDecodedHtml( String.valueOf(Character.toChars(0x10000)), - Encoding.decodeHtml("𐀀")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "�ziggy", - Encoding.decodeHtml("�ziggy")); - assertEquals( - "਀z;", - Encoding.decodeHtml("਀z;")); - assertEquals( - "&#\n", - Encoding.decodeHtml("&# ")); - assertEquals( - "&#x\n", - Encoding.decodeHtml("&#x ")); - assertEquals( - "\n\n", - Encoding.decodeHtml(" ")); - assertEquals( - "&#\n", - Encoding.decodeHtml("&# ")); - assertEquals( - "&#x", - Encoding.decodeHtml("&#x")); - assertEquals( - "", // NUL elided. - Encoding.decodeHtml("�")); - assertEquals( - "&#", - Encoding.decodeHtml("&#")); - - assertEquals( - "\\", - Encoding.decodeHtml("\\")); - assertEquals( - "&", - Encoding.decodeHtml("&")); - - assertEquals( - "�a;", - Encoding.decodeHtml("�a;")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "\t", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "�ziggy", - Encoding.decodeHtml("�ziggy")); - assertEquals( - "&#\n", - Encoding.decodeHtml("&# ")); - assertEquals( - "\n", - Encoding.decodeHtml("� ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); - assertEquals( - "&#\n", - Encoding.decodeHtml("&# ")); - assertEquals( - "", // Invalid XML char elided. - Encoding.decodeHtml("")); - assertEquals( - "\t", - Encoding.decodeHtml(" ")); - assertEquals( - "\n", - Encoding.decodeHtml(" ")); + "𐀀" + ); + assertDecodedHtml("\n", " "); + assertDecodedHtml("�ziggy", "�ziggy"); + assertDecodedHtml("਀z;", "਀z;"); + assertDecodedHtml("&#\n", "&# "); + assertDecodedHtml("&#x\n", "&#x "); + assertDecodedHtml("\n\n", " "); + assertDecodedHtml("&#\n", "&# "); + assertDecodedHtml("&#x", "&#x"); + assertDecodedHtml("", "�"); // NUL elided. + assertDecodedHtml("&#", "&#"); + + assertDecodedHtml("\\", "\\"); + assertDecodedHtml("&", "&"); + + assertDecodedHtml("�a;", "�a;"); + assertDecodedHtml("\n", " "); + assertDecodedHtml("\n", " "); + assertDecodedHtml("\n", " "); + assertDecodedHtml("\t", " "); + assertDecodedHtml("\n", " "); + assertDecodedHtml("�ziggy", "�ziggy"); + assertDecodedHtml("&#\n", "&# "); + assertDecodedHtml("\n", "� "); + assertDecodedHtml("\n", " "); + assertDecodedHtml("&#\n", "&# "); + assertDecodedHtml("", ""); // Invalid XML char elided. + assertDecodedHtml("\t", " "); + assertDecodedHtml("\n", " "); // test the named escapes - assertEquals( - "<", - Encoding.decodeHtml("<")); - assertEquals( - ">", - Encoding.decodeHtml(">")); - assertEquals( - "\"", - Encoding.decodeHtml(""")); - assertEquals( - "'", - Encoding.decodeHtml("'")); - assertEquals( - "'", - Encoding.decodeHtml("'")); - assertEquals( - "'", - Encoding.decodeHtml("'")); - assertEquals( - "&", - Encoding.decodeHtml("&")); - assertEquals( - "<", - Encoding.decodeHtml("&lt;")); - assertEquals( - "&", - Encoding.decodeHtml("&")); - assertEquals( - "&", - Encoding.decodeHtml("&")); - assertEquals( - "&", - Encoding.decodeHtml("&AmP;")); - assertEquals( - "\u0391", - Encoding.decodeHtml("Α")); - assertEquals( - "\u03b1", - Encoding.decodeHtml("α")); - assertEquals( - "\ud835\udc9c", // U+1D49C requires a surrogate pair in UTF-16. - Encoding.decodeHtml("𝒜")); - assertEquals( - "fj", // fj refers to 2 characters. - Encoding.decodeHtml("fj")); - assertEquals( - "\u2233", // HTML entity with the longest name. - Encoding.decodeHtml("∳")); - assertEquals( // Missing the semicolon. - "&CounterClockwiseContourIntegral", - Encoding.decodeHtml("&CounterClockwiseContourIntegral")); - - assertEquals( - "&;", - Encoding.decodeHtml("&;")); - assertEquals( - "&bogus;", - Encoding.decodeHtml("&bogus;")); + assertDecodedHtml("<", "<"); + assertDecodedHtml(">", ">"); + assertDecodedHtml("\"", """); + assertDecodedHtml("'", "'"); + assertDecodedHtml("'", "'"); + assertDecodedHtml("'", "'"); + assertDecodedHtml("&", "&"); + assertDecodedHtml("<", "&lt;"); + assertDecodedHtml("&", "&"); + assertDecodedHtml("&", "&"); + assertDecodedHtml("&", "&AmP;"); + assertDecodedHtml("\u0391", "Α"); + assertDecodedHtml("\u03b1", "α"); + // U+1D49C requires a surrogate pair in UTF-16. + assertDecodedHtml("\ud835\udc9c", "𝒜"); + // fj refers to 2 characters. + assertDecodedHtml("fj", "fj"); + // HTML entity with the longest name. + assertDecodedHtml("\u2233", "∳"); + // Missing the semicolon. + assertDecodedHtml( + "&CounterClockwiseContourIntegral", + "&CounterClockwiseContourIntegral" + ); + + assertDecodedHtml("&;", "&;"); + assertDecodedHtml("&bogus;", "&bogus;"); + + // Some strings decode differently depending on whether or not they're in an HTML attribute. + assertDecodedHtml( + "?foo\u00B6m=bar", + "?foo¶m=bar", + "?foo¶m=bar" + ); + assertDecodedHtml( + "?foo\u00B6=bar", + "?foo¶=bar", + "?foo¶=bar" + ); } @Test diff --git a/src/test/java/org/owasp/html/HtmlSanitizerTest.java b/src/test/java/org/owasp/html/HtmlSanitizerTest.java index 53ff9270..d1a73e86 100644 --- a/src/test/java/org/owasp/html/HtmlSanitizerTest.java +++ b/src/test/java/org/owasp/html/HtmlSanitizerTest.java @@ -440,6 +440,13 @@ public static final void testMacOSAndIOSQueryOfDeath() { } } + @Test + public static final void testIssue254SemicolonlessNamedCharactersInUrls() { + String input = "click me"; + String want = "click me"; + assertEquals(want, sanitize(input)); + } + private static String sanitize(@Nullable String html) { StringBuilder sb = new StringBuilder(); HtmlStreamRenderer renderer = HtmlStreamRenderer.create(