Skip to content

Decode attribute content differently from text node content #255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion src/main/java/org/owasp/html/Encoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,21 @@ public final class Encoding {
*
* @param s text/html
* @return text/plain
* @deprecated Use {@link #decodeHtml(String, boolean)} to specify whether {@code s} is in an attribute value.
*/
@Deprecated
public static String decodeHtml(String s) {
  // Legacy entry point: with no context supplied, decode using text-node
  // rules (inAttribute == false).
  return decodeHtml(s, false);
}

/**
* Decodes HTML entities to produce a string containing only valid
* Unicode scalar values.
*
* @param s text/html
* @param inAttribute is s in an attribute value?
* @return text/plain
*/
public static String decodeHtml(String s, boolean inAttribute) {
int firstAmp = s.indexOf('&');
int safeLimit = longestPrefixOfGoodCodeunits(s);
if ((firstAmp & safeLimit) < 0) { return s; }
Expand All @@ -55,7 +68,7 @@ public static String decodeHtml(String s) {
int amp = firstAmp;
while (amp >= 0) {
sb.append(s, pos, amp);
int end = HtmlEntities.appendDecodedEntity(s, amp, n, sb);
int end = HtmlEntities.appendDecodedEntity(s, amp, n, inAttribute, sb);
pos = end;
amp = s.indexOf('&', end);
}
Expand Down
50 changes: 46 additions & 4 deletions src/main/java/org/owasp/html/HtmlEntities.java
Original file line number Diff line number Diff line change
Expand Up @@ -2307,9 +2307,26 @@ final class HtmlEntities {
* in {@code html}.
* @param sb string builder to append to.
* @return The offset after the end of the decoded sequence in {@code html}.
* @deprecated Use {@link #appendDecodedEntity(String, int, int, boolean, StringBuilder)} to specify whether {@code html} is in an attribute value.
*/
public static int appendDecodedEntity(
String html, int offset, int limit, StringBuilder sb) {
String html, int offset, int limit, StringBuilder sb) {
return appendDecodedEntity(html, offset, limit, false, sb);
}

/**
* Decodes any HTML entity at the given location and appends it to a string
* builder. This handles both named and numeric entities.
*
* @param html HTML text.
* @param offset the position of the sequence to decode in {@code html}.
* @param limit the last position that could be part of the sequence to decode
* in {@code html}.
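* @param inAttribute true if {@code html} is attribute value content; in that
*     case a named reference that lacks a closing semicolon is not decoded
*     when it is directly followed by a letter, digit, or {@code =}.
* @param sb string builder to append to.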
* @return The offset after the end of the decoded sequence in {@code html}.
*/
public static int appendDecodedEntity(
String html, int offset, int limit, boolean inAttribute, StringBuilder sb) {
char ch = html.charAt(offset);
if ('&' != ch) {
sb.append(ch);
Expand Down Expand Up @@ -2422,19 +2439,20 @@ public static int appendDecodedEntity(
char nameChar = html.charAt(i);
t = t.lookup(nameChar);
if (t == null) { break; }
if (t.isTerminal()) {
if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
longestDecode = t;
tail = i + 1;
}
}
// Try again, case insensitively.
if (longestDecode == null) {
t = ENTITY_TRIE;
for (int i = offset + 1; i < limit; ++i) {
char nameChar = html.charAt(i);
if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
t = t.lookup(nameChar);
if (t == null) { break; }
if (t.isTerminal()) {
if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
longestDecode = t;
tail = i + 1;
}
Expand All @@ -2456,11 +2474,35 @@ public static int appendDecodedEntity(

private static boolean isHtmlIdContinueChar(char ch) {
int chLower = ch | 32;
return ('0' <= chLower && chLower <= '9')
return ('0' <= ch && ch <= '9')
|| ('a' <= chLower && chLower <= 'z')
|| ('-' == ch);
}

/**
 * Decides whether a match whose last character is at index {@code i} may be
 * accepted as a complete named character reference.
 *
 * @param inAttribute whether {@code html} is attribute value content.
 * @param html the text being decoded.
 * @param i index of the last character matched so far.
 * @param limit index {@code i + 1} is inspected only when it is below this.
 * @return false only when in an attribute, the character at {@code i} is not
 *     a semicolon, and the following character continues the name; true in
 *     every other case.
 */
private static boolean mayComplete(boolean inAttribute, String html, int i, int limit) {
  if (!inAttribute) {
    return true;
  }
  if (html.charAt(i) == ';') {
    return true;
  }
  int next = i + 1;
  if (next >= limit) {
    return true;
  }
  // The following character can veto the match. Without this, "&para" would
  // be decoded inside <a href="?foo&param=1">.
  return !continuesCharacterReferenceName(html.charAt(next));
}

/**
 * True if ch is an ASCII letter, an ASCII digit, or an equals sign.
 *
 * @see <a href="https://github.com/OWASP/java-html-sanitizer/issues/254#issuecomment-1080864368"
 * >comments in issue 254</a>
 */
private static boolean continuesCharacterReferenceName(char ch) {
  if (ch == '=') {
    return true;
  }
  if (ch >= '0' && ch <= '9') {
    return true;
  }
  // Setting bit 5 folds 'A'..'Z' onto 'a'..'z'.
  int folded = ch | 32;
  return folded >= 'a' && folded <= 'z';
}

// /** A possible entity name like "amp" or "gt". */
// public static boolean isEntityName(String name) {
// Trie t = ENTITY_TRIE;
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/owasp/html/HtmlSanitizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public static void sanitize(
switch (token.type) {
case TEXT:
receiver.text(
Encoding.decodeHtml(htmlContent.substring(token.start, token.end)));
Encoding.decodeHtml(htmlContent.substring(token.start, token.end), false));
break;
case UNESCAPED:
receiver.text(Encoding.stripBannedCodeunits(
Expand Down Expand Up @@ -177,8 +177,9 @@ public static void sanitize(
htmlContent.substring(tagBodyToken.start, tagBodyToken.end)));
break;
case ATTRVALUE:
attrs.add(Encoding.decodeHtml(stripQuotes(
htmlContent.substring(tagBodyToken.start, tagBodyToken.end))));
String attributeContentRaw =
stripQuotes(htmlContent.substring(tagBodyToken.start, tagBodyToken.end));
attrs.add(Encoding.decodeHtml(attributeContentRaw, true));
attrsReadyForName = true;
break;
case TAGEND:
Expand Down
Loading