Use z3 to find a state mapping that only needs a u32 transition table

oxalica · oxalica · commit 16d31927701e · 2025-02-08T20:11:00.000-05:00
This shrinks the table size to 1KiB for less cache pressure, and
produces faster code on platforms that only support 32-bit shift.

Though, it does not affect the throughput on 64-bit platforms when the
table is already fully in cache.
diff --git a/library/core/src/str/solve_dfa.py b/library/core/src/str/solve_dfa.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Use z3 to solve UTF-8 validation DFA for offset and transition table,
+# in order to encode transition table into u32.
+# We minimize the output variables in the solution to make it deterministic.
+# Ref: <https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f>
+# See a more detailed explanation in `./validations.rs`.
+#
+# It is expected to find a solution in <30s on a modern machine, and the
+# solution is appended to the end of this file.
+from z3 import *
+
+STATE_CNT = 9
+
+# The transition table.
+# A value X on column Y means state Y should transition to state X on some
+# input bytes. We assign state 0 as ERROR and state 1 as ACCEPT (initial).
+# E.g. the first line: on input bytes 00..=7F, S1 -> S1 and every other state -> S0.
+TRANSITIONS = [
+    # 0  1  2  3  4  5  6  7  8
+    # First bytes
+    ((0, 1, 0, 0, 0, 0, 0, 0, 0), "00-7F"),
+    ((0, 2, 0, 0, 0, 0, 0, 0, 0), "C2-DF"),
+    ((0, 3, 0, 0, 0, 0, 0, 0, 0), "E0"),
+    ((0, 4, 0, 0, 0, 0, 0, 0, 0), "E1-EC, EE-EF"),
+    ((0, 5, 0, 0, 0, 0, 0, 0, 0), "ED"),
+    ((0, 6, 0, 0, 0, 0, 0, 0, 0), "F0"),
+    ((0, 7, 0, 0, 0, 0, 0, 0, 0), "F1-F3"),
+    ((0, 8, 0, 0, 0, 0, 0, 0, 0), "F4"),
+    # Continuation bytes
+    ((0, 0, 1, 0, 2, 2, 0, 4, 4), "80-8F"),
+    ((0, 0, 1, 0, 2, 2, 4, 4, 0), "90-9F"),
+    ((0, 0, 1, 2, 2, 0, 4, 4, 0), "A0-BF"),
+    # Illegal
+    ((0, 0, 0, 0, 0, 0, 0, 0, 0), "C0-C1, F5-FF"),
+]
+
+o = Optimize()
+offsets = [BitVec(f'o{i}', 32) for i in range(STATE_CNT)]
+trans_table = [BitVec(f't{i}', 32) for i in range(len(TRANSITIONS))]
+
+# Add some guiding constraints to make solving faster.
+o.add(offsets[0] == 0)
+o.add(trans_table[-1] == 0)
+
+for i in range(len(offsets)):
+    o.add(offsets[i] < 32 - 5) # Do not over-shift. It's not necessary but makes solving faster.
+    for j in range(i):
+        o.add(offsets[i] != offsets[j])
+for trans, (targets, _) in zip(trans_table, TRANSITIONS):
+    for src, tgt in enumerate(targets):
+        o.add((LShR(trans, offsets[src]) & 31) == offsets[tgt])
+
+# Minimize ordered outputs to get a unique solution.
+goal = Concat(*offsets, *trans_table)
+o.minimize(goal)
+print(o.check())
+print('Offset[]= ', [o.model()[i].as_long() for i in offsets])
+print('Transitions:')
+for (_, label), v in zip(TRANSITIONS, [o.model()[i].as_long() for i in trans_table]):
+    print(f'{label:14} => {v:#10x}, // {v:032b}')
+
+# Output should be deterministic:
+# sat
+# Offset[]=  [0, 6, 16, 19, 1, 25, 11, 18, 24]
+# Transitions:
+# 00-7F          =>      0x180, // 00000000000000000000000110000000
+# C2-DF          =>      0x400, // 00000000000000000000010000000000
+# E0             =>      0x4c0, // 00000000000000000000010011000000
+# E1-EC, EE-EF   =>       0x40, // 00000000000000000000000001000000
+# ED             =>      0x640, // 00000000000000000000011001000000
+# F0             =>      0x2c0, // 00000000000000000000001011000000
+# F1-F3          =>      0x480, // 00000000000000000000010010000000
+# F4             =>      0x600, // 00000000000000000000011000000000
+# 80-8F          => 0x21060020, // 00100001000001100000000000100000
+# 90-9F          => 0x20060820, // 00100000000001100000100000100000
+# A0-BF          =>   0x860820, // 00000000100001100000100000100000
+# C0-C1, F5-FF   =>        0x0, // 00000000000000000000000000000000
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -2,7 +2,6 @@
 
 use super::Utf8Error;
 use crate::intrinsics::{const_eval_select, unlikely};
-use crate::mem;
 
 /// Returns the initial codepoint accumulator for the first byte.
 /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
@@ -112,34 +111,33 @@ where
     Some(ch)
 }
 
-// The transition table of shift-based DFA for UTF-8 validation.
+// The shift-based DFA algorithm for UTF-8 validation.
 // Ref: <https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725>
 //
 // In short, we encode DFA transitions in an array `TRANS_TABLE` such that:
 // ```
 // TRANS_TABLE[next_byte] =
-//     (target_state1 * BITS_PER_STATE) << (source_state1 * BITS_PER_STATE) |
-//     (target_state2 * BITS_PER_STATE) << (source_state2 * BITS_PER_STATE) |
+//     OFFSET[target_state1] << OFFSET[source_state1] |
+//     OFFSET[target_state2] << OFFSET[source_state2] |
 //     ...
 // ```
-// Thanks to pre-multiplication, we can execute the DFA with one statement per byte:
+// Where `OFFSET[]` is a compile-time map from each state to a distinct 0..32 value.
+//
+// To execute the DFA:
 // ```
-// let state = initial_state * BITS_PER_STATE;
+// let state = OFFSET[initial_state];
 // for byte in .. {
 //     state = TRANS_TABLE[byte] >> (state & ((1 << BITS_PER_STATE) - 1));
 // }
 // ```
-// By choosing `BITS_PER_STATE = 6` and `state: u64`, we can replace the masking by `wrapping_shr`.
+// By choosing `BITS_PER_STATE = 5` and `state: u32`, we can replace the masking by `wrapping_shr`
+// and it becomes free on modern ISAs, including x86, x86_64 and ARM.
+//
 // ```
-// // shrx state, qword ptr [table_addr + 8 * byte], state   # On x86-64-v3
+// // shrx state, dword ptr [table_addr + 4 * byte], state   # On x86-64-v3
 // state = TRANS_TABLE[byte].wrapping_shr(state);
 // ```
 //
-// On platform without 64-bit shift, especially i686, we split the `u64` next-state into
-// `[u32; 2]`, and each `u32` stores 5 * BITS_PER_STATE = 30 bits. In this way, state transition
-// can be done in only 32-bit shifts and a conditional move, which is several times faster
-// (in latency) than ordinary 64-bit shift (SHRD).
-//
 // The DFA is directly derived from UTF-8 syntax from the RFC3629:
 // <https://datatracker.ietf.org/doc/html/rfc3629#section-4>.
 // We assign S0 as ERROR and S1 as ACCEPT. DFA starts at S1.
@@ -152,116 +150,49 @@ where
 //               <S1> (%xE1-EC / %xEE-EF)    <S4> 2( UTF8-tail ) /
 //               <S1> %xED                   <S5> %x80-9F <S2> UTF8-tail
 // UTF8-4      = <S1> %xF0    <S6> %x90-BF   <S4> 2( UTF8-tail ) /
-//               <S1> %xF4    <S7> %x80-8F   <S4> 2( UTF8-tail ) /
-//               <S1> %xF1-F3 <S8> UTF8-tail <S4> 2( UTF8-tail )
-//
+//               <S1> %xF1-F3 <S7> UTF8-tail <S4> 2( UTF8-tail ) /
+//               <S1> %xF4    <S8> %x80-8F   <S4> 2( UTF8-tail )
 // UTF8-tail   = %x80-BF   # Inlined into above usages.
-const BITS_PER_STATE: u32 = 6;
+//
+// You may notice that encoding 9 states with 5 bits per state into 32 bits seems impossible,
+// but we exploit overlapping bits to find a possible `OFFSET[]` and `TRANS_TABLE[]` solution.
+// The z3 (SMT solver) script that finds such a (minimal) solution is in `./solve_dfa.py`.
+// The solution is also appended to the end of that file and is verifiable.
+const BITS_PER_STATE: u32 = 5;
 const STATE_MASK: u32 = (1 << BITS_PER_STATE) - 1;
 const STATE_CNT: usize = 9;
-#[allow(clippy::all)]
-const ST_ERROR: u32 = 0 * BITS_PER_STATE as u32;
-#[allow(clippy::all)]
-const ST_ACCEPT: u32 = 1 * BITS_PER_STATE as u32;
-
-/// Platforms that does not have efficient 64-bit shift and should use 32-bit shift fallback.
-const USE_SHIFT32: bool = cfg!(all(
-    any(target_pointer_width = "16", target_pointer_width = "32"),
-    // WASM32 supports 64-bit shift.
-    not(target_arch = "wasm32"),
-));
+const ST_ERROR: u32 = OFFSETS[0];
+const ST_ACCEPT: u32 = OFFSETS[1];
+// See the end of `./solve_dfa.py`.
+const OFFSETS: [u32; STATE_CNT] = [0, 6, 16, 19, 1, 25, 11, 18, 24];
 
-// After storing STATE_CNT * BITS_PER_STATE = 54bits on 64-bit platform, or (STATE_CNT - 5)
-// * BITS_PER_STATE = 24bits on 32-bit platform, we still have some high bits left.
-// They will never be used via state transition.
-// We merge lookup table from first byte -> UTF-8 length, to these highest bits.
-const UTF8_LEN_HIBITS: u32 = 4;
-
-static TRANS_TABLE: [u64; 256] = {
-    let mut table = [0u64; 256];
+static TRANS_TABLE: [u32; 256] = {
+    let mut table = [0u32; 256];
     let mut b = 0;
     while b < 256 {
-        // Target states indexed by starting states.
-        let mut to = [0u64; STATE_CNT];
-        to[0] = 0;
-        to[1] = match b {
-            0x00..=0x7F => 1,
-            0xC2..=0xDF => 2,
-            0xE0 => 3,
-            0xE1..=0xEC | 0xEE..=0xEF => 4,
-            0xED => 5,
-            0xF0 => 6,
-            0xF4 => 7,
-            0xF1..=0xF3 => 8,
-            _ => 0,
-        };
-        to[2] = match b {
-            0x80..=0xBF => 1,
-            _ => 0,
-        };
-        to[3] = match b {
-            0xA0..=0xBF => 2,
-            _ => 0,
-        };
-        to[4] = match b {
-            0x80..=0xBF => 2,
-            _ => 0,
-        };
-        to[5] = match b {
-            0x80..=0x9F => 2,
-            _ => 0,
+        // See the end of `./solve_dfa.py`.
+        table[b] = match b as u8 {
+            0x00..=0x7F => 0x180,
+            0xC2..=0xDF => 0x400,
+            0xE0 => 0x4C0,
+            0xE1..=0xEC | 0xEE..=0xEF => 0x40,
+            0xED => 0x640,
+            0xF0 => 0x2C0,
+            0xF1..=0xF3 => 0x480,
+            0xF4 => 0x600,
+            0x80..=0x8F => 0x21060020,
+            0x90..=0x9F => 0x20060820,
+            0xA0..=0xBF => 0x860820,
+            0xC0..=0xC1 | 0xF5..=0xFF => 0x0,
         };
-        to[6] = match b {
-            0x90..=0xBF => 4,
-            _ => 0,
-        };
-        to[7] = match b {
-            0x80..=0x8F => 4,
-            _ => 0,
-        };
-        to[8] = match b {
-            0x80..=0xBF => 4,
-            _ => 0,
-        };
-
-        // On platforms without 64-bit shift, align states 5..10 to 32-bit boundary.
-        // See docs above for details.
-        let mut bits = 0u64;
-        let mut j = 0;
-        while j < to.len() {
-            let to_off =
-                to[j] * BITS_PER_STATE as u64 + if USE_SHIFT32 && to[j] >= 5 { 2 } else { 0 };
-            let off = j as u32 * BITS_PER_STATE + if USE_SHIFT32 && j >= 5 { 2 } else { 0 };
-            bits |= to_off << off;
-            j += 1;
-        }
-
-        let utf8_len = match b {
-            0x00..=0x7F => 1,
-            0xC2..=0xDF => 2,
-            0xE0..=0xEF => 3,
-            0xF0..=0xF4 => 4,
-            _ => 0,
-        };
-        bits |= utf8_len << (64 - UTF8_LEN_HIBITS);
-
-        table[b] = bits;
         b += 1;
     }
     table
 };
 
 #[inline(always)]
 const fn next_state(st: u32, byte: u8) -> u32 {
-    if USE_SHIFT32 {
-        // SAFETY: `u64` is more aligned than `u32`, and has the same repr as `[u32; 2]`.
-        let [lo, hi] = unsafe { mem::transmute::<u64, [u32; 2]>(TRANS_TABLE[byte as usize]) };
-        #[cfg(target_endian = "big")]
-        let (lo, hi) = (hi, lo);
-        if st & 32 == 0 { lo } else { hi }.wrapping_shr(st)
-    } else {
-        TRANS_TABLE[byte as usize].wrapping_shr(st as _) as _
-    }
+    TRANS_TABLE[byte as usize].wrapping_shr(st)
 }
 
 /// Check if `byte` is a valid UTF-8 first byte, assuming it must be a valid first or
@@ -407,13 +338,33 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
     Ok(())
 }
 
+// https://tools.ietf.org/html/rfc3629
+const UTF8_CHAR_WIDTH: &[u8; 256] = &[
+    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
+];
+
 /// Given a first byte, determines how many bytes are in this UTF-8 character.
 #[unstable(feature = "str_internals", issue = "none")]
 #[must_use]
 #[inline]
 pub const fn utf8_char_width(b: u8) -> usize {
-    // On 32-bit platforms, optimizer is smart enough to only load and operate on the high 32-bits.
-    (TRANS_TABLE[b as usize] >> (64 - UTF8_LEN_HIBITS)) as usize
+    UTF8_CHAR_WIDTH[b as usize] as usize
 }
 
 /// Mask of the value bits of a continuation byte.