Merge pull request #2426 from pinkjuice66/parser-correct-utf8validation

ahoppen · web-flow · commit 4dcc4ac199c2 · 2024-01-12T13:28:53.000-08:00
[Parser] Correct the start byte range for UTF8 characters.
diff --git a/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift b/Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift
@@ -156,12 +156,6 @@ extension Unicode.Scalar {
     // including and above the DEL character U+7F.
     return self.value >= 0x20 && self.value < 0x7F
   }
-
-  var isStartOfUTF8Character: Bool {
-    // RFC 2279: The octet values FE and FF never appear.
-    // RFC 3629: The octet values C0, C1, F5 to FF never appear.
-    return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
-  }
 }
 
 extension Unicode.Scalar {
@@ -179,20 +173,25 @@ extension Unicode.Scalar {
       return Unicode.Scalar(curByte)
     }
 
-    // Read the number of high bits set, which indicates the number of bytes in
-    // the character.
-    let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
-
-    // If this is 0b10XXXXXX, then it is a continuation character.
-    if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
+    // If this is not the start of a UTF-8 character, then it is either a
+    // continuation byte or a byte that can never begin a valid UTF-8 sequence.
+    if !curByte.isStartOfUTF8Character {
       // Skip until we get the start of another character.  This is guaranteed to
       // at least stop at the nul at the end of the buffer.
-      while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
+      while let peeked = peek(), !peeked.isStartOfUTF8Character {
         _ = advance()
       }
       return nil
     }
 
+    // Read the number of high bits set, which indicates the number of bytes in
+    // the character.
+    let encodedBytes = (~curByte).leadingZeroBitCount
+    // We have a multi-byte UTF-8 scalar.
+    // Single-byte UTF-8 scalars are handled at the start of the function by checking `curByte < 0x80`.
+    // `isStartOfUTF8Character` guarantees that `curByte` has 2 to 4 leading ones.
+    precondition(encodedBytes >= 2 && encodedBytes <= 4)
+
     // Drop the high bits indicating the # bytes of the result.
     var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
 
@@ -252,3 +251,11 @@ extension Unicode.Scalar {
     return self.lexing(advance: advance, peek: peek)
   }
 }
+
+extension UInt8 {
+  var isStartOfUTF8Character: Bool {
+    // RFC 2279: The octet values FE and FF never appear.
+    // RFC 3629: The octet values C0, C1, F5 to FF never appear.
+    return self < 0x80 || (self >= 0xC2 && self < 0xF5)
+  }
+}
diff --git a/Tests/SwiftParserTest/LexerTests.swift b/Tests/SwiftParserTest/LexerTests.swift
@@ -1504,4 +1504,23 @@ public class LexerTests: ParserTestCase {
       ]
     )
   }
+
+  func testUnicodeContainTheEdgeContinuationByte() {
+    // A continuation byte must be in the range greater than or
+    // equal to 0x80 and less than or equal to 0xBF
+
+    // À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
+    // ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
+    assertLexemes(
+      "À 㗀 🀀 ÿ 俿 𐐿",
+      lexemes: [
+        LexemeSpec(.identifier, text: "À", trailing: " "),
+        LexemeSpec(.identifier, text: "㗀", trailing: " "),
+        LexemeSpec(.identifier, text: "🀀", trailing: " "),
+        LexemeSpec(.identifier, text: "ÿ", trailing: " "),
+        LexemeSpec(.identifier, text: "俿", trailing: " "),
+        LexemeSpec(.identifier, text: "𐐿"),
+      ]
+    )
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -1504,4 +1504,23 @@ public class LexerTests: ParserTestCase {`
`1504`	`1504`	`]`
`1505`	`1505`	`)`
`1506`	`1506`	`}`
	`1507`	`+`
	`1508`	`+ func testUnicodeContainTheEdgeContinuationByte() {`
	`1509`	`+ // A continuation byte must be in the range greater than or`
	`1510`	`+ // equal to 0x80 and less than or equal to 0xBF`
	`1511`	`+`
	`1512`	`+ // À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),`
	`1513`	`+ // ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)`
	`1514`	`+ assertLexemes(`
	`1515`	`+ "À 㗀 🀀 ÿ 俿 𐐿",`
	`1516`	`+ lexemes: [`
	`1517`	`+ LexemeSpec(.identifier, text: "À", trailing: " "),`
	`1518`	`+ LexemeSpec(.identifier, text: "㗀", trailing: " "),`
	`1519`	`+ LexemeSpec(.identifier, text: "🀀", trailing: " "),`
	`1520`	`+ LexemeSpec(.identifier, text: "ÿ", trailing: " "),`
	`1521`	`+ LexemeSpec(.identifier, text: "俿", trailing: " "),`
	`1522`	`+ LexemeSpec(.identifier, text: "𐐿"),`
	`1523`	`+ ]`
	`1524`	`+ )`
	`1525`	`+ }`
`1507`	`1526`	`}`