Skip to content

Commit 5372c74

Browse files
authored
Decode attribute content differently from text node content (#255)
As described in issue #254 `&para` is a full complete character reference when decoding text node content, but not when decoding attribute content which causes problems for URL attribute values like /test?param1=foo&param2=bar As shown via JS test code in that issue, a small set of next characters prevent a character reference name match from being considered complete. This commit: - modifies the decode functions to take an extra parameter `boolean inAttribute`, and modifies the Trie traversal loops to not store a longest match so far based on that parameter and some next character tests - modifies the HTML lexer to pass that attribute appropriately - for backwards compat, leaves the old APIs in place but `@deprecated` - adds unit tests for the decode functions - adds a unit test for the specific input from the issue This change should make us more conformant with observed browser behaviour so is not expected to cause compatibility problems for existing users. Fixes #254
1 parent 06b299c commit 5372c74

File tree

5 files changed

+166
-165
lines changed

5 files changed

+166
-165
lines changed

src/main/java/org/owasp/html/Encoding.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,21 @@ public final class Encoding {
4141
*
4242
* @param s text/html
4343
* @return text/plain
44+
* @deprecated specify whether s is in an attribute value
4445
*/
4546
public static String decodeHtml(String s) {
47+
return decodeHtml(s, false);
48+
}
49+
50+
/**
51+
* Decodes HTML entities to produce a string containing only valid
52+
* Unicode scalar values.
53+
*
54+
* @param s text/html
55+
* @param inAttribute is s in an attribute value?
56+
* @return text/plain
57+
*/
58+
public static String decodeHtml(String s, boolean inAttribute) {
4659
int firstAmp = s.indexOf('&');
4760
int safeLimit = longestPrefixOfGoodCodeunits(s);
4861
if ((firstAmp & safeLimit) < 0) { return s; }
@@ -55,7 +68,7 @@ public static String decodeHtml(String s) {
5568
int amp = firstAmp;
5669
while (amp >= 0) {
5770
sb.append(s, pos, amp);
58-
int end = HtmlEntities.appendDecodedEntity(s, amp, n, sb);
71+
int end = HtmlEntities.appendDecodedEntity(s, amp, n, inAttribute, sb);
5972
pos = end;
6073
amp = s.indexOf('&', end);
6174
}

src/main/java/org/owasp/html/HtmlEntities.java

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2307,9 +2307,26 @@ final class HtmlEntities {
23072307
* in {@code html}.
23082308
* @param sb string builder to append to.
23092309
* @return The offset after the end of the decoded sequence in {@code html}.
2310+
* @deprecated specify whether html is in an attribute value.
23102311
*/
23112312
public static int appendDecodedEntity(
2312-
String html, int offset, int limit, StringBuilder sb) {
2313+
String html, int offset, int limit, StringBuilder sb) {
2314+
return appendDecodedEntity(html, offset, limit, false, sb);
2315+
}
2316+
2317+
/**
2318+
* Decodes any HTML entity at the given location and appends it to a string
2319+
* builder. This handles both named and numeric entities.
2320+
*
2321+
* @param html HTML text.
2322+
* @param offset the position of the sequence to decode in {@code html}.
2323+
* @param limit the last position that could be part of the sequence to decode
2324+
* in {@code html}.
2325+
* @param sb string builder to append to.
2326+
* @return The offset after the end of the decoded sequence in {@code html}.
2327+
*/
2328+
public static int appendDecodedEntity(
2329+
String html, int offset, int limit, boolean inAttribute, StringBuilder sb) {
23132330
char ch = html.charAt(offset);
23142331
if ('&' != ch) {
23152332
sb.append(ch);
@@ -2422,19 +2439,20 @@ public static int appendDecodedEntity(
24222439
char nameChar = html.charAt(i);
24232440
t = t.lookup(nameChar);
24242441
if (t == null) { break; }
2425-
if (t.isTerminal()) {
2442+
if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
24262443
longestDecode = t;
24272444
tail = i + 1;
24282445
}
24292446
}
2447+
// Try again, case insensitively.
24302448
if (longestDecode == null) {
24312449
t = ENTITY_TRIE;
24322450
for (int i = offset + 1; i < limit; ++i) {
24332451
char nameChar = html.charAt(i);
24342452
if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
24352453
t = t.lookup(nameChar);
24362454
if (t == null) { break; }
2437-
if (t.isTerminal()) {
2455+
if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
24382456
longestDecode = t;
24392457
tail = i + 1;
24402458
}
@@ -2456,11 +2474,35 @@ public static int appendDecodedEntity(
24562474

24572475
private static boolean isHtmlIdContinueChar(char ch) {
24582476
int chLower = ch | 32;
2459-
return ('0' <= chLower && chLower <= '9')
2477+
return ('0' <= ch && ch <= '9')
24602478
|| ('a' <= chLower && chLower <= 'z')
24612479
|| ('-' == ch);
24622480
}
24632481

2482+
/** True if the character at i in html may complete a named character reference */
2483+
private static boolean mayComplete(boolean inAttribute, String html, int i, int limit) {
2484+
if (inAttribute && html.charAt(i) != ';' && i + 1 < limit) {
2485+
// See if the next character blocks treating this as a full match.
2486+
// This avoids problems like "&para" being treated as a decoding in
2487+
// <a href="?foo&param=1">
2488+
if (continuesCharacterReferenceName(html.charAt(i + 1))) {
2489+
return false;
2490+
}
2491+
}
2492+
return true;
2493+
}
2494+
2495+
/**
2496+
* @see <a href="https://github.com/OWASP/java-html-sanitizer/issues/254#issuecomment-1080864368"
2497+
* >comments in issue 254</a>
2498+
*/
2499+
private static boolean continuesCharacterReferenceName(char ch) {
2500+
int chLower = ch | 32;
2501+
return ('0' <= ch && ch <= '9')
2502+
|| ('a' <= chLower && chLower <= 'z')
2503+
|| (ch == '=');
2504+
}
2505+
24642506
// /** A possible entity name like "amp" or "gt". */
24652507
// public static boolean isEntityName(String name) {
24662508
// Trie t = ENTITY_TRIE;

src/main/java/org/owasp/html/HtmlSanitizer.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ public static void sanitize(
144144
switch (token.type) {
145145
case TEXT:
146146
receiver.text(
147-
Encoding.decodeHtml(htmlContent.substring(token.start, token.end)));
147+
Encoding.decodeHtml(htmlContent.substring(token.start, token.end), false));
148148
break;
149149
case UNESCAPED:
150150
receiver.text(Encoding.stripBannedCodeunits(
@@ -177,8 +177,9 @@ public static void sanitize(
177177
htmlContent.substring(tagBodyToken.start, tagBodyToken.end)));
178178
break;
179179
case ATTRVALUE:
180-
attrs.add(Encoding.decodeHtml(stripQuotes(
181-
htmlContent.substring(tagBodyToken.start, tagBodyToken.end))));
180+
String attributeContentRaw =
181+
stripQuotes(htmlContent.substring(tagBodyToken.start, tagBodyToken.end));
182+
attrs.add(Encoding.decodeHtml(attributeContentRaw, true));
182183
attrsReadyForName = true;
183184
break;
184185
case TAGEND:

0 commit comments

Comments
 (0)