2
2
3
3
use super :: Utf8Error ;
4
4
use crate :: intrinsics:: { const_eval_select, unlikely} ;
5
- use crate :: mem;
6
5
7
6
/// Returns the initial codepoint accumulator for the first byte.
8
7
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
@@ -112,34 +111,33 @@ where
112
111
Some ( ch)
113
112
}
114
113
115
- // The transition table of shift-based DFA for UTF-8 validation.
114
+ // The shift-based DFA algorithm for UTF-8 validation.
116
115
// Ref: <https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725>
117
116
//
118
117
// In short, we encode DFA transitions in an array `TRANS_TABLE` such that:
119
118
// ```
120
119
// TRANS_TABLE[next_byte] =
121
- // ( target_state1 * BITS_PER_STATE) << ( source_state1 * BITS_PER_STATE) |
122
- // ( target_state2 * BITS_PER_STATE) << ( source_state2 * BITS_PER_STATE) |
120
+ // OFFSET[ target_state1] << OFFSET[ source_state1] |
121
+ // OFFSET[ target_state2] << OFFSET[ source_state2] |
123
122
// ...
124
123
// ```
125
- // Thanks to pre-multiplication, we can execute the DFA with one statement per byte:
124
+ // Where `OFFSET[]` is a compile-time map from each state to a distinct 0..32 value.
125
+ //
126
+ // To execute the DFA:
126
127
// ```
127
- // let state = initial_state * BITS_PER_STATE ;
128
+ // let state = OFFSET[ initial_state] ;
128
129
// for byte in .. {
129
130
// state = TRANS_TABLE[byte] >> (state & ((1 << BITS_PER_STATE) - 1));
130
131
// }
131
132
// ```
132
- // By choosing `BITS_PER_STATE = 6` and `state: u64`, we can replace the masking by `wrapping_shr`.
133
+ // By choosing `BITS_PER_STATE = 5` and `state: u32`, we can replace the masking by `wrapping_shr`
134
+ // and it becomes free on modern ISAs, including x86, x86_64 and ARM.
135
+ //
133
136
// ```
134
137
// // shrx state, dword ptr [table_addr + 4 * byte], state # On x86-64-v3
135
138
// state = TRANS_TABLE[byte].wrapping_shr(state);
136
139
// ```
137
140
//
138
- // On platform without 64-bit shift, especially i686, we split the `u64` next-state into
139
- // `[u32; 2]`, and each `u32` stores 5 * BITS_PER_STATE = 30 bits. In this way, state transition
140
- // can be done in only 32-bit shifts and a conditional move, which is several times faster
141
- // (in latency) than ordinary 64-bit shift (SHRD).
142
- //
143
141
// The DFA is directly derived from UTF-8 syntax from the RFC3629:
144
142
// <https://datatracker.ietf.org/doc/html/rfc3629#section-4>.
145
143
// We assign S0 as ERROR and S1 as ACCEPT. DFA starts at S1.
@@ -152,116 +150,49 @@ where
152
150
// <S1> (%xE1-EC / %xEE-EF) <S4> 2( UTF8-tail ) /
153
151
// <S1> %xED <S5> %x80-9F <S2> UTF8-tail
154
152
// UTF8-4 = <S1> %xF0 <S6> %x90-BF <S4> 2( UTF8-tail ) /
155
- // <S1> %xF4 <S7> %x80-8F <S4> 2( UTF8-tail ) /
156
- // <S1> %xF1-F3 <S8> UTF8-tail <S4> 2( UTF8-tail )
157
- //
153
+ // <S1> %xF1-F3 <S7> UTF8-tail <S4> 2( UTF8-tail ) /
154
+ // <S1> %xF4 <S8> %x80-8F <S4> 2( UTF8-tail )
158
155
// UTF8-tail = %x80-BF # Inlined into above usages.
159
- const BITS_PER_STATE : u32 = 6 ;
156
+ //
157
+ // You may notice that encoding 9 states with 5 bits per state into 32 bits seems impossible,
158
+ // but we exploit overlapping bits to find a possible `OFFSET[]` and `TRANS_TABLE[]` solution.
159
+ // The SAT solver used to find such a (minimal) solution is in `./solve_dfa.py`.
160
+ // The solution is also appended to the end of that file and is verifiable.
161
+ const BITS_PER_STATE : u32 = 5 ;
160
162
const STATE_MASK : u32 = ( 1 << BITS_PER_STATE ) - 1 ;
161
163
const STATE_CNT : usize = 9 ;
162
- #[ allow( clippy:: all) ]
163
- const ST_ERROR : u32 = 0 * BITS_PER_STATE as u32 ;
164
- #[ allow( clippy:: all) ]
165
- const ST_ACCEPT : u32 = 1 * BITS_PER_STATE as u32 ;
166
-
167
- /// Platforms that does not have efficient 64-bit shift and should use 32-bit shift fallback.
168
- const USE_SHIFT32 : bool = cfg ! ( all(
169
- any( target_pointer_width = "16" , target_pointer_width = "32" ) ,
170
- // WASM32 supports 64-bit shift.
171
- not( target_arch = "wasm32" ) ,
172
- ) ) ;
164
+ const ST_ERROR : u32 = OFFSETS [ 0 ] ;
165
+ const ST_ACCEPT : u32 = OFFSETS [ 1 ] ;
166
+ // See the end of `./solve_dfa.py`.
167
+ const OFFSETS : [ u32 ; STATE_CNT ] = [ 0 , 6 , 16 , 19 , 1 , 25 , 11 , 18 , 24 ] ;
173
168
174
- // After storing STATE_CNT * BITS_PER_STATE = 54bits on 64-bit platform, or (STATE_CNT - 5)
175
- // * BITS_PER_STATE = 24bits on 32-bit platform, we still have some high bits left.
176
- // They will never be used via state transition.
177
- // We merge lookup table from first byte -> UTF-8 length, to these highest bits.
178
- const UTF8_LEN_HIBITS : u32 = 4 ;
179
-
180
- static TRANS_TABLE : [ u64 ; 256 ] = {
181
- let mut table = [ 0u64 ; 256 ] ;
169
+ static TRANS_TABLE : [ u32 ; 256 ] = {
170
+ let mut table = [ 0u32 ; 256 ] ;
182
171
let mut b = 0 ;
183
172
while b < 256 {
184
- // Target states indexed by starting states.
185
- let mut to = [ 0u64 ; STATE_CNT ] ;
186
- to[ 0 ] = 0 ;
187
- to[ 1 ] = match b {
188
- 0x00 ..=0x7F => 1 ,
189
- 0xC2 ..=0xDF => 2 ,
190
- 0xE0 => 3 ,
191
- 0xE1 ..=0xEC | 0xEE ..=0xEF => 4 ,
192
- 0xED => 5 ,
193
- 0xF0 => 6 ,
194
- 0xF4 => 7 ,
195
- 0xF1 ..=0xF3 => 8 ,
196
- _ => 0 ,
197
- } ;
198
- to[ 2 ] = match b {
199
- 0x80 ..=0xBF => 1 ,
200
- _ => 0 ,
201
- } ;
202
- to[ 3 ] = match b {
203
- 0xA0 ..=0xBF => 2 ,
204
- _ => 0 ,
205
- } ;
206
- to[ 4 ] = match b {
207
- 0x80 ..=0xBF => 2 ,
208
- _ => 0 ,
209
- } ;
210
- to[ 5 ] = match b {
211
- 0x80 ..=0x9F => 2 ,
212
- _ => 0 ,
173
+ // See the end of `./solve_dfa.py`.
174
+ table[ b] = match b as u8 {
175
+ 0x00 ..=0x7F => 0x180 ,
176
+ 0xC2 ..=0xDF => 0x400 ,
177
+ 0xE0 => 0x4C0 ,
178
+ 0xE1 ..=0xEC | 0xEE ..=0xEF => 0x40 ,
179
+ 0xED => 0x640 ,
180
+ 0xF0 => 0x2C0 ,
181
+ 0xF1 ..=0xF3 => 0x480 ,
182
+ 0xF4 => 0x600 ,
183
+ 0x80 ..=0x8F => 0x21060020 ,
184
+ 0x90 ..=0x9F => 0x20060820 ,
185
+ 0xA0 ..=0xBF => 0x860820 ,
186
+ 0xC0 ..=0xC1 | 0xF5 ..=0xFF => 0x0 ,
213
187
} ;
214
- to[ 6 ] = match b {
215
- 0x90 ..=0xBF => 4 ,
216
- _ => 0 ,
217
- } ;
218
- to[ 7 ] = match b {
219
- 0x80 ..=0x8F => 4 ,
220
- _ => 0 ,
221
- } ;
222
- to[ 8 ] = match b {
223
- 0x80 ..=0xBF => 4 ,
224
- _ => 0 ,
225
- } ;
226
-
227
- // On platforms without 64-bit shift, align states 5..10 to 32-bit boundary.
228
- // See docs above for details.
229
- let mut bits = 0u64 ;
230
- let mut j = 0 ;
231
- while j < to. len ( ) {
232
- let to_off =
233
- to[ j] * BITS_PER_STATE as u64 + if USE_SHIFT32 && to[ j] >= 5 { 2 } else { 0 } ;
234
- let off = j as u32 * BITS_PER_STATE + if USE_SHIFT32 && j >= 5 { 2 } else { 0 } ;
235
- bits |= to_off << off;
236
- j += 1 ;
237
- }
238
-
239
- let utf8_len = match b {
240
- 0x00 ..=0x7F => 1 ,
241
- 0xC2 ..=0xDF => 2 ,
242
- 0xE0 ..=0xEF => 3 ,
243
- 0xF0 ..=0xF4 => 4 ,
244
- _ => 0 ,
245
- } ;
246
- bits |= utf8_len << ( 64 - UTF8_LEN_HIBITS ) ;
247
-
248
- table[ b] = bits;
249
188
b += 1 ;
250
189
}
251
190
table
252
191
} ;
253
192
254
193
#[ inline( always) ]
255
194
const fn next_state ( st : u32 , byte : u8 ) -> u32 {
256
- if USE_SHIFT32 {
257
- // SAFETY: `u64` is more aligned than `u32`, and has the same repr as `[u32; 2]`.
258
- let [ lo, hi] = unsafe { mem:: transmute :: < u64 , [ u32 ; 2 ] > ( TRANS_TABLE [ byte as usize ] ) } ;
259
- #[ cfg( target_endian = "big" ) ]
260
- let ( lo, hi) = ( hi, lo) ;
261
- if st & 32 == 0 { lo } else { hi } . wrapping_shr ( st)
262
- } else {
263
- TRANS_TABLE [ byte as usize ] . wrapping_shr ( st as _ ) as _
264
- }
195
+ TRANS_TABLE [ byte as usize ] . wrapping_shr ( st)
265
196
}
266
197
267
198
/// Check if `byte` is a valid UTF-8 first byte, assuming it must be a valid first or
@@ -407,13 +338,33 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
407
338
Ok ( ( ) )
408
339
}
409
340
341
+ // https://tools.ietf.org/html/rfc3629
342
+ const UTF8_CHAR_WIDTH : & [ u8 ; 256 ] = & [
343
+ // 0 1 2 3 4 5 6 7 8 9 A B C D E F
344
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0
345
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 1
346
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 2
347
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 3
348
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 4
349
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 5
350
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 6
351
+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 7
352
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 8
353
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 9
354
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // A
355
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // B
356
+ 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // C
357
+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // D
358
+ 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // E
359
+ 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // F
360
+ ] ;
361
+
410
362
/// Given a first byte, determines how many bytes are in this UTF-8 character.
411
363
#[ unstable( feature = "str_internals" , issue = "none" ) ]
412
364
#[ must_use]
413
365
#[ inline]
414
366
pub const fn utf8_char_width ( b : u8 ) -> usize {
415
- // On 32-bit platforms, optimizer is smart enough to only load and operate on the high 32-bits.
416
- ( TRANS_TABLE [ b as usize ] >> ( 64 - UTF8_LEN_HIBITS ) ) as usize
367
+ UTF8_CHAR_WIDTH [ b as usize ] as usize
417
368
}
418
369
419
370
/// Mask of the value bits of a continuation byte.
0 commit comments