diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java
index 4a2a601f..20dbed99 100644
--- a/src/main/java/org/owasp/html/Encoding.java
+++ b/src/main/java/org/owasp/html/Encoding.java
@@ -41,8 +41,21 @@ public final class Encoding {
*
* @param s text/html
* @return text/plain
+ * @deprecated specify whether s is in an attribute value
*/
public static String decodeHtml(String s) {
+ return decodeHtml(s, false);
+ }
+
+ /**
+ * Decodes HTML entities to produce a string containing only valid
+ * Unicode scalar values.
+ *
+ * @param s text/html
+ * @param inAttribute is s in an attribute value?
+ * @return text/plain
+ */
+ public static String decodeHtml(String s, boolean inAttribute) {
int firstAmp = s.indexOf('&');
int safeLimit = longestPrefixOfGoodCodeunits(s);
if ((firstAmp & safeLimit) < 0) { return s; }
@@ -55,7 +68,7 @@ public static String decodeHtml(String s) {
int amp = firstAmp;
while (amp >= 0) {
sb.append(s, pos, amp);
- int end = HtmlEntities.appendDecodedEntity(s, amp, n, sb);
+ int end = HtmlEntities.appendDecodedEntity(s, amp, n, inAttribute, sb);
pos = end;
amp = s.indexOf('&', end);
}
diff --git a/src/main/java/org/owasp/html/HtmlEntities.java b/src/main/java/org/owasp/html/HtmlEntities.java
index fc015776..774108cf 100644
--- a/src/main/java/org/owasp/html/HtmlEntities.java
+++ b/src/main/java/org/owasp/html/HtmlEntities.java
@@ -2307,9 +2307,26 @@ final class HtmlEntities {
* in {@code html}.
* @param sb string builder to append to.
* @return The offset after the end of the decoded sequence in {@code html}.
+ * @deprecated specify whether html is in an attribute value.
*/
public static int appendDecodedEntity(
- String html, int offset, int limit, StringBuilder sb) {
+ String html, int offset, int limit, StringBuilder sb) {
+ return appendDecodedEntity(html, offset, limit, false, sb);
+ }
+
+ /**
+ * Decodes any HTML entity at the given location and appends it to a string
+ * builder. This handles both named and numeric entities.
+ *
+ * @param html HTML text.
+ * @param offset the position of the sequence to decode in {@code html}.
+ * @param limit the last position that could be part of the sequence to decode
+ * in {@code html}.
+ * @param sb string builder to append to.
+ * @return The offset after the end of the decoded sequence in {@code html}.
+ */
+ public static int appendDecodedEntity(
+ String html, int offset, int limit, boolean inAttribute, StringBuilder sb) {
char ch = html.charAt(offset);
if ('&' != ch) {
sb.append(ch);
@@ -2422,11 +2439,12 @@ public static int appendDecodedEntity(
char nameChar = html.charAt(i);
t = t.lookup(nameChar);
if (t == null) { break; }
- if (t.isTerminal()) {
+ if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
longestDecode = t;
tail = i + 1;
}
}
+ // Try again, case insensitively.
if (longestDecode == null) {
t = ENTITY_TRIE;
for (int i = offset + 1; i < limit; ++i) {
@@ -2434,7 +2452,7 @@ public static int appendDecodedEntity(
if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
t = t.lookup(nameChar);
if (t == null) { break; }
- if (t.isTerminal()) {
+ if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
longestDecode = t;
tail = i + 1;
}
@@ -2456,11 +2474,35 @@ public static int appendDecodedEntity(
private static boolean isHtmlIdContinueChar(char ch) {
int chLower = ch | 32;
- return ('0' <= chLower && chLower <= '9')
+ return ('0' <= ch && ch <= '9')
|| ('a' <= chLower && chLower <= 'z')
|| ('-' == ch);
}
+ /** True if the character at i in html may complete a named character reference */
+ private static boolean mayComplete(boolean inAttribute, String html, int i, int limit) {
+ if (inAttribute && html.charAt(i) != ';' && i + 1 < limit) {
+ // See if the next character blocks treating this as a full match.
+ // This avoids problems like "¶" being treated as a decoding in
+ //
+ if (continuesCharacterReferenceName(html.charAt(i + 1))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * @see comments in issue 254
+ */
+ private static boolean continuesCharacterReferenceName(char ch) {
+ int chLower = ch | 32;
+ return ('0' <= ch && ch <= '9')
+ || ('a' <= chLower && chLower <= 'z')
+ || (ch == '=');
+ }
+
// /** A possible entity name like "amp" or "gt". */
// public static boolean isEntityName(String name) {
// Trie t = ENTITY_TRIE;
diff --git a/src/main/java/org/owasp/html/HtmlSanitizer.java b/src/main/java/org/owasp/html/HtmlSanitizer.java
index 63d7ae95..d5f8750c 100644
--- a/src/main/java/org/owasp/html/HtmlSanitizer.java
+++ b/src/main/java/org/owasp/html/HtmlSanitizer.java
@@ -144,7 +144,7 @@ public static void sanitize(
switch (token.type) {
case TEXT:
receiver.text(
- Encoding.decodeHtml(htmlContent.substring(token.start, token.end)));
+ Encoding.decodeHtml(htmlContent.substring(token.start, token.end), false));
break;
case UNESCAPED:
receiver.text(Encoding.stripBannedCodeunits(
@@ -177,8 +177,9 @@ public static void sanitize(
htmlContent.substring(tagBodyToken.start, tagBodyToken.end)));
break;
case ATTRVALUE:
- attrs.add(Encoding.decodeHtml(stripQuotes(
- htmlContent.substring(tagBodyToken.start, tagBodyToken.end))));
+ String attributeContentRaw =
+ stripQuotes(htmlContent.substring(tagBodyToken.start, tagBodyToken.end));
+ attrs.add(Encoding.decodeHtml(attributeContentRaw, true));
attrsReadyForName = true;
break;
case TAGEND:
diff --git a/src/test/java/org/owasp/html/EncodingTest.java b/src/test/java/org/owasp/html/EncodingTest.java
index eea7769a..54ff0b45 100644
--- a/src/test/java/org/owasp/html/EncodingTest.java
+++ b/src/test/java/org/owasp/html/EncodingTest.java
@@ -34,6 +34,24 @@
@SuppressWarnings("javadoc")
public final class EncodingTest extends TestCase {
+ private static void assertDecodedHtml(String want, String inputHtml) {
+ assertDecodedHtml(want, want, inputHtml);
+ }
+
+ private static void assertDecodedHtml(
+ String wantText, String wantAttr, String inputHtml
+ ) {
+ assertEquals(
+ "!inAttribute: " + inputHtml,
+ wantText,
+ Encoding.decodeHtml(inputHtml, false)
+ );
+ assertEquals(
+ "inAttribute: " + inputHtml,
+ wantAttr,
+ Encoding.decodeHtml(inputHtml, true)
+ );
+ }
@Test
public static final void testDecodeHtml() {
@@ -43,170 +61,90 @@ public static final void testDecodeHtml() {
// 123456789012345678901234567890123456789012345678901234567890123456789
String golden =
"The quick\u00a0brown fox\njumps over\r\nthe lazy dog\n";
- assertEquals(golden, Encoding.decodeHtml(html));
+ assertDecodedHtml(golden, html);
// Don't allocate a new string when no entities.
- assertSame(golden, Encoding.decodeHtml(golden));
+ assertSame(golden, golden);
// test interrupted escapes and escapes at end of file handled gracefully
- assertEquals(
- "\\\\u000a",
- Encoding.decodeHtml("\\\\u000a"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
+ assertDecodedHtml("\\\\u000a", "\\\\u000a");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml(
String.valueOf(Character.toChars(0x10000)),
- Encoding.decodeHtml("𐀀"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "ziggy",
- Encoding.decodeHtml("ziggy"));
- assertEquals(
- "z;",
- Encoding.decodeHtml("z;"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "",
- Encoding.decodeHtml(""));
- assertEquals(
- "", // NUL elided.
- Encoding.decodeHtml(""));
- assertEquals(
- "",
- Encoding.decodeHtml(""));
-
- assertEquals(
- "\\",
- Encoding.decodeHtml("\\"));
- assertEquals(
- "&",
- Encoding.decodeHtml("&"));
-
- assertEquals(
- "a;",
- Encoding.decodeHtml("a;"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\t",
- Encoding.decodeHtml(" "));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "ziggy",
- Encoding.decodeHtml("ziggy"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
- assertEquals(
- "", // Invalid XML char elided.
- Encoding.decodeHtml(""));
- assertEquals(
- "\t",
- Encoding.decodeHtml(" "));
- assertEquals(
- "\n",
- Encoding.decodeHtml("
"));
+ "𐀀"
+ );
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("ziggy", "ziggy");
+ assertDecodedHtml("z;", "z;");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("", "");
+ assertDecodedHtml("", ""); // NUL elided.
+ assertDecodedHtml("", "");
+
+ assertDecodedHtml("\\", "\\");
+ assertDecodedHtml("&", "&");
+
+ assertDecodedHtml("a;", "a;");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\t", " ");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("ziggy", "ziggy");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("\n", "
");
+ assertDecodedHtml("", ""); // Invalid XML char elided.
+ assertDecodedHtml("\t", " ");
+ assertDecodedHtml("\n", "
");
// test the named escapes
- assertEquals(
- "<",
- Encoding.decodeHtml("<"));
- assertEquals(
- ">",
- Encoding.decodeHtml(">"));
- assertEquals(
- "\"",
- Encoding.decodeHtml("""));
- assertEquals(
- "'",
- Encoding.decodeHtml("'"));
- assertEquals(
- "'",
- Encoding.decodeHtml("'"));
- assertEquals(
- "'",
- Encoding.decodeHtml("'"));
- assertEquals(
- "&",
- Encoding.decodeHtml("&"));
- assertEquals(
- "<",
- Encoding.decodeHtml("<"));
- assertEquals(
- "&",
- Encoding.decodeHtml("&"));
- assertEquals(
- "&",
- Encoding.decodeHtml("&"));
- assertEquals(
- "&",
- Encoding.decodeHtml("&AmP;"));
- assertEquals(
- "\u0391",
- Encoding.decodeHtml("Α"));
- assertEquals(
- "\u03b1",
- Encoding.decodeHtml("α"));
- assertEquals(
- "\ud835\udc9c", // U+1D49C requires a surrogate pair in UTF-16.
- Encoding.decodeHtml("𝒜"));
- assertEquals(
- "fj", // fj refers to 2 characters.
- Encoding.decodeHtml("fj"));
- assertEquals(
- "\u2233", // HTML entity with the longest name.
- Encoding.decodeHtml("∳"));
- assertEquals( // Missing the semicolon.
- "&CounterClockwiseContourIntegral",
- Encoding.decodeHtml("&CounterClockwiseContourIntegral"));
-
- assertEquals(
- "&;",
- Encoding.decodeHtml("&;"));
- assertEquals(
- "&bogus;",
- Encoding.decodeHtml("&bogus;"));
+ assertDecodedHtml("<", "<");
+ assertDecodedHtml(">", ">");
+ assertDecodedHtml("\"", """);
+ assertDecodedHtml("'", "'");
+ assertDecodedHtml("'", "'");
+ assertDecodedHtml("'", "'");
+ assertDecodedHtml("&", "&");
+ assertDecodedHtml("<", "<");
+ assertDecodedHtml("&", "&");
+ assertDecodedHtml("&", "&");
+ assertDecodedHtml("&", "&AmP;");
+ assertDecodedHtml("\u0391", "Α");
+ assertDecodedHtml("\u03b1", "α");
+ // U+1D49C requires a surrogate pair in UTF-16.
+ assertDecodedHtml("\ud835\udc9c", "𝒜");
+ // fj refers to 2 characters.
+ assertDecodedHtml("fj", "fj");
+ // HTML entity with the longest name.
+ assertDecodedHtml("\u2233", "∳");
+ // Missing the semicolon.
+ assertDecodedHtml(
+ "&CounterClockwiseContourIntegral",
+ "&CounterClockwiseContourIntegral"
+ );
+
+ assertDecodedHtml("&;", "&;");
+ assertDecodedHtml("&bogus;", "&bogus;");
+
+ // Some strings decode differently depending on whether or not they're in an HTML attribute.
+ assertDecodedHtml(
+ "?foo\u00B6m=bar",
+ "?foo¶m=bar",
+ "?foo¶m=bar"
+ );
+ assertDecodedHtml(
+ "?foo\u00B6=bar",
+ "?foo¶=bar",
+ "?foo¶=bar"
+ );
}
@Test
diff --git a/src/test/java/org/owasp/html/HtmlSanitizerTest.java b/src/test/java/org/owasp/html/HtmlSanitizerTest.java
index 53ff9270..d1a73e86 100644
--- a/src/test/java/org/owasp/html/HtmlSanitizerTest.java
+++ b/src/test/java/org/owasp/html/HtmlSanitizerTest.java
@@ -440,6 +440,13 @@ public static final void testMacOSAndIOSQueryOfDeath() {
}
}
+ @Test
+ public static final void testIssue254SemicolonlessNamedCharactersInUrls() {
+ String input = "click me";
+ String want = "click me";
+ assertEquals(want, sanitize(input));
+ }
+
private static String sanitize(@Nullable String html) {
StringBuilder sb = new StringBuilder();
HtmlStreamRenderer renderer = HtmlStreamRenderer.create(