OWASP · mikesamuel · Jun 8, 2022 · Mar 28, 2022
diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java
@@ -41,8 +41,21 @@ public final class Encoding {
    *
    * @param s text/html
    * @return text/plain
+   * @deprecated use {@link #decodeHtml(String, boolean)} to specify whether {@code s} is in an attribute value
    */
   public static String decodeHtml(String s) {
+    return decodeHtml(s, false);  // inAttribute=false: s is decoded as text that is not in an attribute value
+  }
+
+  /**
+   * Decodes HTML entities to produce a string containing only valid
+   * Unicode scalar values.
+   *
+   * @param s text/html
+   * @param inAttribute true if {@code s} is the content of an attribute value
+   * @return text/plain
+   */
+  public static String decodeHtml(String s, boolean inAttribute) {
     int firstAmp = s.indexOf('&');
     int safeLimit = longestPrefixOfGoodCodeunits(s);
     if ((firstAmp & safeLimit) < 0) { return s; }
@@ -55,7 +68,7 @@ public static String decodeHtml(String s) {
       int amp = firstAmp;
       while (amp >= 0) {
         sb.append(s, pos, amp);
-        int end = HtmlEntities.appendDecodedEntity(s, amp, n, sb);
+        int end = HtmlEntities.appendDecodedEntity(s, amp, n, inAttribute, sb);
         pos = end;
         amp = s.indexOf('&', end);
       }

diff --git a/src/main/java/org/owasp/html/HtmlEntities.java b/src/main/java/org/owasp/html/HtmlEntities.java
@@ -2307,9 +2307,26 @@ final class HtmlEntities {
    *    in {@code html}.
    * @param sb string builder to append to.
    * @return The offset after the end of the decoded sequence in {@code html}.
+   * @deprecated use {@link #appendDecodedEntity(String, int, int, boolean, StringBuilder)} to specify whether {@code html} is in an attribute value.
    */
   public static int appendDecodedEntity(
-      String html, int offset, int limit, StringBuilder sb) {
+     String html, int offset, int limit, StringBuilder sb) {
+    return appendDecodedEntity(html, offset, limit, false, sb);  // inAttribute=false: html is not treated as an attribute value
+  }
+
+  /**
+   * Decodes any HTML entity at the given location and appends it to a string
+   * builder.  This handles both named and numeric entities.
+   *
+   * @param html HTML text.
+   * @param offset the position of the sequence to decode in {@code html}.
+   * @param limit the last position that could be part of the sequence to decode in {@code html}.
+   * @param inAttribute true if {@code html} is the content of an attribute value.
+   * @param sb string builder to append to.
+   * @return The offset after the end of the decoded sequence in {@code html}.
+   */
+  public static int appendDecodedEntity(
+      String html, int offset, int limit, boolean inAttribute, StringBuilder sb) {
     char ch = html.charAt(offset);
     if ('&' != ch) {
       sb.append(ch);
@@ -2422,19 +2439,20 @@ public static int appendDecodedEntity(
         char nameChar = html.charAt(i);
         t = t.lookup(nameChar);
         if (t == null) { break; }
-        if (t.isTerminal()) {
+        if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
           longestDecode = t;
           tail = i + 1;
         }
       }
+      // Try again, case insensitively.
       if (longestDecode == null) {
         t = ENTITY_TRIE;
         for (int i = offset + 1; i < limit; ++i) {
           char nameChar = html.charAt(i);
           if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; }
           t = t.lookup(nameChar);
           if (t == null) { break; }
-          if (t.isTerminal()) {
+          if (t.isTerminal() && mayComplete(inAttribute, html, i, limit)) {
             longestDecode = t;
             tail = i + 1;
           }
@@ -2456,11 +2474,35 @@ public static int appendDecodedEntity(
 
   private static boolean isHtmlIdContinueChar(char ch) {
     int chLower = ch | 32;
-    return ('0' <= chLower && chLower <= '9')
+    return ('0' <= ch && ch <= '9')  // test ch itself: (ch | 32) also maps U+0010..U+0019 onto '0'..'9'
             || ('a' <= chLower && chLower <= 'z')
             || ('-' == ch);
   }
 
+  /** Reports whether the name ending at index i of html may be accepted as a complete named character reference. */
+  private static boolean mayComplete(boolean inAttribute, String html, int i, int limit) {
+    // Outside attribute values, after an explicit ';', or when nothing
+    // follows before the limit, no later character can block the match.
+    if (!inAttribute || html.charAt(i) == ';' || i + 1 >= limit) {
+      return true;
+    }
+    // In an attribute value, a following letter, digit, or '=' blocks the
+    // match.  This keeps "&para" from being treated as a decoding in
+    //     <a href="?foo&param=1">
+    return !continuesCharacterReferenceName(html.charAt(i + 1));
+  }
+
+  /**
+   * True for an ASCII letter, an ASCII digit, or {@code '='}.
+   * @see <a href="https://github.com/OWASP/java-html-sanitizer/issues/254#issuecomment-1080864368">comments in issue 254</a>
+   */
+  private static boolean continuesCharacterReferenceName(char ch) {
+    if (ch == '=' || ('0' <= ch && ch <= '9')) { return true; }
+    // OR-ing in bit 5 folds 'A'..'Z' onto 'a'..'z'.
+    int folded = ch | 32;
+    return 'a' <= folded && folded <= 'z';
+  }
+
 //  /** A possible entity name like "amp" or "gt". */
 //  public static boolean isEntityName(String name) {
 //    Trie t = ENTITY_TRIE;

diff --git a/src/main/java/org/owasp/html/HtmlSanitizer.java b/src/main/java/org/owasp/html/HtmlSanitizer.java
@@ -144,7 +144,7 @@ public static void sanitize(
       switch (token.type) {
         case TEXT:
           receiver.text(
-              Encoding.decodeHtml(htmlContent.substring(token.start, token.end)));
+              Encoding.decodeHtml(htmlContent.substring(token.start, token.end), false));
           break;
         case UNESCAPED:
           receiver.text(Encoding.stripBannedCodeunits(
@@ -177,8 +177,9 @@ public static void sanitize(
                       htmlContent.substring(tagBodyToken.start, tagBodyToken.end)));
                   break;
                 case ATTRVALUE:
-                  attrs.add(Encoding.decodeHtml(stripQuotes(
-                      htmlContent.substring(tagBodyToken.start, tagBodyToken.end))));
+                  String attributeContentRaw =
+                          stripQuotes(htmlContent.substring(tagBodyToken.start, tagBodyToken.end));
+                  attrs.add(Encoding.decodeHtml(attributeContentRaw, true));
                   attrsReadyForName = true;
                   break;
                 case TAGEND: