Skip to content

Commit 4dcc4ac

Browse files
authored
Merge pull request #2426 from pinkjuice66/parser-correct-utf8validation
[Parser] Correct the start byte range for UTF8 characters.
2 parents a7fa220 + 38bfc61 commit 4dcc4ac

File tree

2 files changed

+39
-13
lines changed

2 files changed

+39
-13
lines changed

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -156,12 +156,6 @@ extension Unicode.Scalar {
156156
// including and above the DEL character U+7F.
157157
return self.value >= 0x20 && self.value < 0x7F
158158
}
159-
160-
var isStartOfUTF8Character: Bool {
161-
// RFC 2279: The octet values FE and FF never appear.
162-
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
163-
return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
164-
}
165159
}
166160

167161
extension Unicode.Scalar {
@@ -179,20 +173,25 @@ extension Unicode.Scalar {
179173
return Unicode.Scalar(curByte)
180174
}
181175

182-
// Read the number of high bits set, which indicates the number of bytes in
183-
// the character.
184-
let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
185-
186-
// If this is 0b10XXXXXX, then it is a continuation character.
187-
if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
176+
// If this is not the start of a UTF8 character,
177+
// then it is either a continuation byte or a byte that never appears in valid UTF-8.
178+
if !curByte.isStartOfUTF8Character {
188179
// Skip until we get the start of another character. This is guaranteed to
189180
// at least stop at the nul at the end of the buffer.
190-
while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
181+
while let peeked = peek(), !peeked.isStartOfUTF8Character {
191182
_ = advance()
192183
}
193184
return nil
194185
}
195186

187+
// Read the number of high bits set, which indicates the number of bytes in
188+
// the character.
189+
let encodedBytes = (~curByte).leadingZeroBitCount
190+
// We have a multi-byte UTF-8 scalar.
191+
// Single-byte UTF-8 scalars are handled at the start of the function by checking `curByte < 0x80`.
192+
// `isStartOfUTF8Character` guarantees that `curByte` has 2 to 4 leading ones.
193+
precondition(encodedBytes >= 2 && encodedBytes <= 4)
194+
196195
// Drop the high bits indicating the # bytes of the result.
197196
var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
198197

@@ -252,3 +251,11 @@ extension Unicode.Scalar {
252251
return self.lexing(advance: advance, peek: peek)
253252
}
254253
}
254+
255+
extension UInt8 {
256+
var isStartOfUTF8Character: Bool {
257+
// RFC 2279: The octet values FE and FF never appear.
258+
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
259+
return self < 0x80 || (self >= 0xC2 && self < 0xF5)
260+
}
261+
}

Tests/SwiftParserTest/LexerTests.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1504,4 +1504,23 @@ public class LexerTests: ParserTestCase {
15041504
]
15051505
)
15061506
}
1507+
1508+
func testUnicodeContainTheEdgeContinuationByte() {
1509+
// A continuation byte must be in the range greater than or
1510+
// equal to 0x80 and less than or equal to 0xBF
1511+
1512+
// À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
1513+
// ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
1514+
assertLexemes(
1515+
"À 㗀 🀀 ÿ 俿 𐐿",
1516+
lexemes: [
1517+
LexemeSpec(.identifier, text: "À", trailing: " "),
1518+
LexemeSpec(.identifier, text: "㗀", trailing: " "),
1519+
LexemeSpec(.identifier, text: "🀀", trailing: " "),
1520+
LexemeSpec(.identifier, text: "ÿ", trailing: " "),
1521+
LexemeSpec(.identifier, text: "俿", trailing: " "),
1522+
LexemeSpec(.identifier, text: "𐐿"),
1523+
]
1524+
)
1525+
}
15071526
}

0 commit comments

Comments
 (0)