@@ -156,12 +156,6 @@ extension Unicode.Scalar {
156
156
// including and above the DEL character U+7F.
157
157
return self . value >= 0x20 && self . value < 0x7F
158
158
}
159
-
160
- var isStartOfUTF8Character : Bool {
161
- // RFC 2279: The octet values FE and FF never appear.
162
- // RFC 3629: The octet values C0, C1, F5 to FF never appear.
163
- return self . value <= 0x80 || ( self . value >= 0xC2 && self . value < 0xF5 )
164
- }
165
159
}
166
160
167
161
extension Unicode . Scalar {
@@ -179,20 +173,25 @@ extension Unicode.Scalar {
179
173
return Unicode . Scalar ( curByte)
180
174
}
181
175
182
- // Read the number of high bits set, which indicates the number of bytes in
183
- // the character.
184
- let encodedBytes = ( ~ ( UInt32 ( curByte) << 24 ) ) . leadingZeroBitCount
185
-
186
- // If this is 0b10XXXXXX, then it is a continuation character.
187
- if encodedBytes == 1 || !Unicode. Scalar ( curByte) . isStartOfUTF8Character {
176
+ // If this is not the start of a UTF8 character,
177
+ // then it is either a continuation byte or an invalid UTF8 code point.
178
+ if !curByte. isStartOfUTF8Character {
188
179
// Skip until we get the start of another character. This is guaranteed to
189
180
// at least stop at the nul at the end of the buffer.
190
- while let peeked = peek ( ) , !Unicode . Scalar ( peeked) . isStartOfUTF8Character {
181
+ while let peeked = peek ( ) , !peeked. isStartOfUTF8Character {
191
182
_ = advance ( )
192
183
}
193
184
return nil
194
185
}
195
186
187
+ // Read the number of high bits set, which indicates the number of bytes in
188
+ // the character.
189
+ let encodedBytes = ( ~ curByte) . leadingZeroBitCount
190
+ // We have a multi-byte UTF-8 scalar.
191
+ // Single-byte UTF-8 scalars are handled at the start of the function by checking `curByte < 0x80`.
192
+ // `isStartOfUTF8Character` guaranteed that the `curByte` has 2 to 4 leading ones.
193
+ precondition ( encodedBytes >= 2 && encodedBytes <= 4 )
194
+
196
195
// Drop the high bits indicating the # bytes of the result.
197
196
var charValue = UInt32 ( curByte << encodedBytes) >> encodedBytes
198
197
@@ -252,3 +251,11 @@ extension Unicode.Scalar {
252
251
return self . lexing ( advance: advance, peek: peek)
253
252
}
254
253
}
254
+
255
extension UInt8 {
  /// Whether this byte may begin a UTF-8 encoded scalar.
  ///
  /// `true` for single-byte (ASCII) values below `0x80` and for the lead
  /// bytes `0xC2` through `0xF4`; `false` for every other octet.
  ///
  /// - RFC 2279: The octet values FE and FF never appear.
  /// - RFC 3629: The octet values C0, C1, F5 to FF never appear.
  var isStartOfUTF8Character: Bool {
    switch self {
    case 0x00...0x7F:
      // Below 0x80: a complete single-byte scalar.
      return true
    case 0xC2...0xF4:
      // Accepted lead-byte range; C0, C1 and F5...FF are excluded per the
      // RFC notes above.
      return true
    default:
      // Everything in 0x80...0xC1 and 0xF5...0xFF.
      return false
    }
  }
}
0 commit comments