diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 4d182be02c9e9..3422b08db7736 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1,7 +1,7 @@ use std::assert_matches::assert_matches; use std::borrow::Cow; use std::cmp::Ordering::{Equal, Greater, Less}; -use std::str::{from_utf8, from_utf8_unchecked}; +use std::str::{from_utf8, from_utf8_unchecked, Utf8Error}; #[test] fn test_le() { @@ -983,6 +983,250 @@ fn from_utf8_error() { test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3)); } +const fn utf8_error_eq(e: &Utf8Error, valid: usize, err_len: Option) -> bool { + e.valid_up_to() == valid + && match (err_len, e.error_len()) { + (Some(a), Some(b)) => a == b, + (None, None) => true, + _ => false, + } +} + +#[test] +fn from_utf8_error_offset() { + const N: usize = if cfg!(miri) { 8 } else { 64 }; + + #[track_caller] + fn check(input: &[u8], valid: usize, err_len: Option) { + let mut buf = Vec::with_capacity(input.len() + N * N * N * 4); + for i in 0..N { + for j in 0..N { + buf.clear(); + buf.extend(core::iter::repeat(b'a').take(i)); + buf.extend(core::iter::repeat(*b"\xE6\x88\x91").take(i).flat_map(|n| n)); + let bump = buf.len(); + buf.extend_from_slice(input); + + assert!( + utf8_error_eq(&from_utf8(&buf).unwrap_err(), bump + valid, err_len), + "offset ({i}, {j}, _): on {input:?} ({buf:?})" + ); + for k in 0..N { + if k != 0 { + buf.extend(*b"\xD0\xB6") + } + let error = from_utf8(&buf).unwrap_err(); + let real_err = from_utf8(&buf).unwrap_err(); + assert_eq!( + error, real_err, + "(vs ref) offset ({i}, {j}, {k}): on {input:?} ({buf:?})" + ); + } + } + } + } + check(b"A\xC3\xA9 \xFF ", 4, Some(1)); + check(b"A\xC3\xA9 \x80 ", 4, Some(1)); + check(b"A\xC3\xA9 \xC1 ", 4, Some(1)); + check(b"A\xC3\xA9 \xC1", 4, Some(1)); + check(b"A\xC3\xA9 \xC2", 4, None); + check(b"A\xC3\xA9 \xC2 ", 4, Some(1)); + check(b"A\xC3\xA9 \xC2\xC0", 4, Some(1)); + check(b"A\xC3\xA9 \xE0", 4, None); + check(b"A\xC3\xA9 \xE0\x9F", 4, Some(1)); + check(b"A\xC3\xA9 \xE0\xA0", 4, None); + check(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2)); + check(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2)); + check(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1)); + check(b"A\xC3\xA9 \xF1", 4, None); + check(b"A\xC3\xA9 \xF1\x80", 4, None); + check(b"A\xC3\xA9 \xF1\x80\x80", 4, None); + check(b"A\xC3\xA9 \xF1 ", 4, Some(1)); + check(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2)); + check(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3)); + check(b"\xc3\x28", 0, Some(1)); + check(b"\xa0\xa1", 0, Some(1)); + check(b"\xe2\x28\xa1", 0, Some(1)); + check(b"\xe2\x82\x28", 0, Some(2)); + check(b"\xf0\x28\x8c\xbc", 0, Some(1)); + check(b"\xf0\x90\x28\xbc", 0, Some(2)); + check(b"\xf0\x28\x8c\x28", 0, Some(1)); + check(b"\xc0\x9f", 0, Some(1)); + check(b"\xf5\xff\xff\xff", 0, Some(1)); + check(b"\xed\xa0\x81", 0, Some(1)); + check(b"\xf8\x90\x80\x80\x80", 0, Some(1)); + check(b"123456789012345\xed", 15, None); + check(b"123456789012345\xf1", 15, None); + check(b"123456789012345\xc2", 15, None); + check(b"\xC2\x7F", 0, Some(1)); + check(b"\xce", 0, None); + check(b"\xce\xba\xe1", 2, None); + check(b"\xce\xba\xe1\xbd", 2, None); + check(b"\xce\xba\xe1\xbd\xb9\xcf", 5, None); + check(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce", 7, None); + check(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", 9, None); + check(b"\xdf", 0, None); + check(b"\xef\xbf", 0, None); + check(b"\x80", 0, Some(1)); + check(b"\x91\x85\x95\x9e", 0, Some(1)); + check(b"\x6c\x02\x8e\x18", 2, Some(1)); + check(b"\xFF", 0, Some(1)); + check(b"a\xFF", 1, Some(1)); + check(b"\xCE\xB2\xFF", 2, Some(1)); + check(b"\xE2\x98\x83\xFF", 3, Some(1)); + check(b"\xF0\x9D\x9D\xB1\xFF", 4, Some(1)); + check(b"\xCE\xF0", 0, Some(1)); + check(b"\xE2\x98\xF0", 0, Some(2)); + check(b"\xF0\x9D\x9D\xF0", 0, Some(3)); + check(b"\xF0\x82\x82\xAC", 0, Some(1)); + check(b"a\xF0\x82\x82\xAC", 1, Some(1)); + check(b"\xE2\x98\x83\xF0\x82\x82\xAC", 3, Some(1)); + check(b"\xED\xA0\x80", 0, Some(1)); + check(b"\xE2\x98\x83\xED\xA0\x80", 3, Some(1)); + check(b"\xE2\x98\x83\xCE\xE2\x98\x83", 3, Some(1)); + check(b"\xCEa", 0, Some(1)); + check(b"a\xCEa", 1, Some(1)); + check(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83", 3, Some(2)); + // check(b"\xF0\x9D\x9Ca", 3, Some(2)); + check(b"\xE2\x98a", 0, Some(2)); + check(b"a\xE2\x98a", 1, Some(2)); + check(b"\xF0\x9D\x9Ca", 0, Some(3)); + check(b"a\xF0\x9D\x9Ca", 1, Some(3)); + check(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83", 4, Some(3)); + check(b"foobar\xF1\x80\x80quux", 6, Some(3)); + check(b"\xCE", 0, None); + check(b"a\xCE", 1, None); + check(b"\xE2\x98\x83\xCE", 3, None); + check(b"\xE2\x98", 0, None); + check(b"a\xE2\x98", 1, None); + check(b"\xE2\x98\x83\xE2\x98", 3, None); + check(b"\xF0\x9D\x9C", 0, None); + check(b"a\xF0\x9D\x9C", 1, None); + check(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C", 4, None); + check(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF", 8, Some(1)); +} + +#[test] +fn utf8_error_cases_const() { + macro_rules! expect_utf8_error { + ($bytes:expr, $valid:expr, $elen:expr $(,)?) => {{ + assert!(utf8_error_eq(&from_utf8($bytes).unwrap_err(), $valid, $elen)); + const _: () = match from_utf8($bytes) { + Ok(_) => panic!(concat!("shouldn't pass: ", stringify!($bytes))), + Err(e) => assert!(utf8_error_eq(&e, $valid, $elen)), + }; + }}; + } + expect_utf8_error!(b"\xc3\x28", 0, Some(1)); + expect_utf8_error!(b"\xa0\xa1", 0, Some(1)); + expect_utf8_error!(b"\xe2\x28\xa1", 0, Some(1)); + expect_utf8_error!(b"\xe2\x82\x28", 0, Some(2)); + expect_utf8_error!(b"\xf0\x28\x8c\xbc", 0, Some(1)); + expect_utf8_error!(b"\xf0\x90\x28\xbc", 0, Some(2)); + expect_utf8_error!(b"\xf0\x28\x8c\x28", 0, Some(1)); + expect_utf8_error!(b"\xc0\x9f", 0, Some(1)); + expect_utf8_error!(b"\xf5\xff\xff\xff", 0, Some(1)); + expect_utf8_error!(b"\xed\xa0\x81", 0, Some(1)); + expect_utf8_error!(b"\xf8\x90\x80\x80\x80", 0, Some(1)); + expect_utf8_error!(b"123456789012345\xed", 15, None); + expect_utf8_error!(b"123456789012345\xf1", 15, None); + expect_utf8_error!(b"123456789012345\xc2", 15, None); + expect_utf8_error!(b"\xC2\x7F", 0, Some(1)); + expect_utf8_error!(b"\xce", 0, None); + expect_utf8_error!(b"\xce\xba\xe1", 2, None); + expect_utf8_error!(b"\xce\xba\xe1\xbd", 2, None); + expect_utf8_error!(b"\xce\xba\xe1\xbd\xb9\xcf", 5, None); + expect_utf8_error!(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce", 7, None); + expect_utf8_error!(b"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", 9, None); + expect_utf8_error!(b"\xdf", 0, None); + expect_utf8_error!(b"\xef\xbf", 0, None); + expect_utf8_error!(b"\x80", 0, Some(1)); + expect_utf8_error!(b"\x91\x85\x95\x9e", 0, Some(1)); + expect_utf8_error!(b"\x6c\x02\x8e\x18", 2, Some(1)); + expect_utf8_error!( + &[ + 0x25, 0x5b, 0x6e, 0x2c, 0x32, 0x2c, 0x5b, 0x5b, 0x33, 0x2c, 0x34, 0x2c, 0x05, 0x29, + 0x2c, 0x33, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x5b, 0x5b, + 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, + 0x5b, 0x5d, 0x2c, 0x35, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x33, 0x2c, 0x37, 0x2e, 0x33, + 0x2c, 0x39, 0x2e, 0x34, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x33, 0x2c, 0x37, + 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x34, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, + 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x20, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x02, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x23, 0x0a, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x7e, 0x7e, 0x0a, 0x0a, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, + 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5d, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x33, + 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x34, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, + 0x2e, 0x33, 0x2c, 0x37, 0x2e, 0x33, 0x2c, 0x39, 0x2e, 0x34, 0x5d, 0x5d, 0x5d, 0x5d, + 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x5d, 0x01, 0x01, 0x80, + 0x01, 0x01, 0x01, 0x79, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, + ], + 335, + Some(1), + ); + expect_utf8_error!( + &[ + 0x5bu8, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, + 0x5b, 0x80, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, b'0', 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, + ], + 15, + Some(1), + ); + expect_utf8_error!( + &[ + 0x20, 0x0b, 0x01, 0x01, 0x01, 0x64, 0x3a, 0x64, 0x3a, 0x64, 0x3a, 0x5b, 0x5b, 0x5b, + 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, + 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, + 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x5b, 0x30, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x80, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01u8, + ], + 172, + Some(1), + ); +} + #[test] fn test_as_bytes() { // no null @@ -996,6 +1240,29 @@ fn test_as_bytes() { assert_eq!("ศไทย中华Việt Nam".as_bytes(), v); } +#[test] +#[cfg(not(miri))] +fn from_utf8_all_chars() { + for i in 0..(0x10FFFF + 1) { + let Some(cp) = char::from_u32(i) else { + continue; + }; + let mut buf = [0; 4]; + let s: &str = cp.encode_utf8(&mut buf); + assert_eq!(Ok(s), from_utf8(s.as_bytes())); + } +} + +#[test] +fn test_multi() { + assert!(from_utf8(b"abc").is_ok()); + assert!(from_utf8(b"a\xE2\x98\x83a").is_ok()); + assert!(from_utf8(b"a\xF0\x9D\x9C\xB7a").is_ok()); + assert!(from_utf8(b"\xE2\x98\x83\xF0\x9D\x9C\xB7").is_ok()); + assert!(from_utf8(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a").is_ok()); + assert!(from_utf8(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD").is_ok()); +} + #[test] #[should_panic] fn test_as_bytes_fail() { diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 2acef432f2063..292d5a12c4542 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -1,6 +1,6 @@ //! Operations related to UTF-8 validation. - -use crate::mem; +mod utf8_dfa; +pub(super) use utf8_dfa::run_utf8_validation; use super::Utf8Error; @@ -112,135 +112,6 @@ where Some(ch) } -const NONASCII_MASK: usize = usize::repeat_u8(0x80); - -/// Returns `true` if any byte in the word `x` is nonascii (>= 128). -#[inline] -const fn contains_nonascii(x: usize) -> bool { - (x & NONASCII_MASK) != 0 -} - -/// Walks through `v` checking that it's a valid UTF-8 sequence, -/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. -#[inline(always)] -#[rustc_const_unstable(feature = "str_internals", issue = "none")] -pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { - let mut index = 0; - let len = v.len(); - - let usize_bytes = mem::size_of::(); - let ascii_block_size = 2 * usize_bytes; - let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; - let align = v.as_ptr().align_offset(usize_bytes); - - while index < len { - let old_offset = index; - macro_rules! err { - ($error_len: expr) => { - return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }) - }; - } - - macro_rules! next { - () => {{ - index += 1; - // we needed data, but there was none: error! - if index >= len { - err!(None) - } - v[index] - }}; - } - - let first = v[index]; - if first >= 128 { - let w = utf8_char_width(first); - // 2-byte encoding is for codepoints \u{0080} to \u{07ff} - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u{0800} to \u{ffff} - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \u{d800} to \u{dfff} - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => { - if next!() as i8 >= -64 { - err!(Some(1)) - } - } - 3 => { - match (first, next!()) { - (0xE0, 0xA0..=0xBF) - | (0xE1..=0xEC, 0x80..=0xBF) - | (0xED, 0x80..=0x9F) - | (0xEE..=0xEF, 0x80..=0xBF) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - } - 4 => { - match (first, next!()) { - (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - if next!() as i8 >= -64 { - err!(Some(3)) - } - } - _ => err!(Some(1)), - } - index += 1; - } else { - // Ascii case, try to skip forward quickly. - // When the pointer is aligned, read 2 words of data per iteration - // until we find a word containing a non-ascii byte. - if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 { - let ptr = v.as_ptr(); - while index < blocks_end { - // SAFETY: since `align - index` and `ascii_block_size` are - // multiples of `usize_bytes`, `block = ptr.add(index)` is - // always aligned with a `usize` so it's safe to dereference - // both `block` and `block.add(1)`. - unsafe { - let block = ptr.add(index) as *const usize; - // break if there is a nonascii byte - let zu = contains_nonascii(*block); - let zv = contains_nonascii(*block.add(1)); - if zu || zv { - break; - } - } - index += ascii_block_size; - } - // step from the point where the wordwise loop stopped - while index < len && v[index] < 128 { - index += 1; - } - } else { - index += 1; - } - } - } - - Ok(()) -} - // https://tools.ietf.org/html/rfc3629 const UTF8_CHAR_WIDTH: &[u8; 256] = &[ // 1 2 3 4 5 6 7 8 9 A B C D E F diff --git a/library/core/src/str/validations/utf8_dfa.rs b/library/core/src/str/validations/utf8_dfa.rs new file mode 100644 index 0000000000000..8fa219f1eae22 --- /dev/null +++ b/library/core/src/str/validations/utf8_dfa.rs @@ -0,0 +1,324 @@ +//! FIXME docs/writeup/etc. +use super::Utf8Error; +use core::intrinsics::{const_eval_select, likely, unlikely}; + +/// Transition table for the Shift-DFA. +// Align to an cache line boundary to reduce the cache footprint. This is just a +// rough approximation, and doesn't really need to be perfect. +#[cfg_attr(target_pointer_width = "64", repr(C, align(128)))] +#[cfg_attr(target_pointer_width = "32", repr(C, align(64)))] +struct DfaTransitions([u32; 256]); + +// State IDs we need to reference in the code. These and the transition table +// were generated by small program that drives a solver (FIXME link), which is +// why we're able to use a 32 bit rows rather than the traditional 64 bit. +const ERR: u32 = 0; +const END: u32 = 17; + +#[rustfmt::skip] +const DFA: &DfaTransitions = { + const ILL: u32 = 0b00000000000000000000000000000000; + const X00: u32 = 0b00000000001000100000000000000000; + const XC2: u32 = 0b00000000000011000000000000000000; + const XE0: u32 = 0b00000000000110000000000000000000; + const XE1: u32 = 0b00000000000010000000000000000000; + const XED: u32 = 0b00000000001011000000000000000000; + const XF0: u32 = 0b00000000001100100000000000000000; + const XF1: u32 = 0b00000000000100000000000000000000; + const XF4: u32 = 0b00000000001110000000000000000000; + const X80: u32 = 0b01000001100000000000010001100000; + const X90: u32 = 0b00001001100000000000010001100000; + const XA0: u32 = 0b00001000000000000110010001100000; + + &DfaTransitions([ + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, X00, + + X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, X80, + X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, X90, + + XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, + XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, XA0, + + ILL, ILL, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, + XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, XC2, + + XE0, XE1, XE1, XE1, XE1, XE1, XE1, XE1, XE1, XE1, XE1, XE1, XE1, XED, XE1, XE1, + XF0, XF1, XF1, XF1, XF4, ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL, + ]) +}; + +// Use a generic to help the compiler out some — we pass both `&[u8]` and `&[u8; +// CHUNK_LEN]` in here, and would like it to know about the constant. +#[must_use] +#[inline] +fn dfa_run(mut state: u32, chunk: impl AsRef<[u8]>) -> u32 { + for &byte in chunk.as_ref() { + state = (DFA.0[byte as usize] as u32) >> (state & 31); + } + state & 31 +} + +// advance the DFA a single step, returning the new masked state. If you have a +// slice you should use `dfa_run` instead; it's usually more efficient. +#[must_use] +#[inline(always)] +const fn dfa_step(state: u32, byte: u8) -> u32 { + ((DFA.0[byte as usize] as u32) >> (state & 31)) & 31 +} + +// Note: not trivial to change. +const CHUNK_LEN: usize = 16; + +/// Walks through `v` checking that it's a valid UTF-8 sequence, +/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. +#[inline] +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +pub(crate) const fn run_utf8_validation(inp: &[u8]) -> Result<(), Utf8Error> { + const fn validate_utf8_const(inp: &[u8]) -> Result<(), Utf8Error> { + validate_carefully(inp, 0) + } + + #[inline] + fn validate_utf8_rt(inp: &[u8]) -> Result<(), Utf8Error> { + match inp.len() { + 0 => Ok(()), + 1..=CHUNK_LEN => { + if is_ascii_small(inp) { + Ok(()) + } else { + validate_carefully(inp, 0) + } + } + _ => validate_utf8_impl(inp), + } + } + + // SAFETY: These are equivalent aside from optimizations. + unsafe { const_eval_select((inp,), validate_utf8_const, validate_utf8_rt) } +} + +/// Validation code which isn't particularly optimized, but can produce an +/// accurate `Utf8Error` (and works in `const`), but +#[inline] +const fn validate_carefully(input: &[u8], mut pos: usize) -> Result<(), Utf8Error> { + let mut state = END; + let mut valid_up_to = pos; + while pos < input.len() { + state = dfa_step(state, input[pos]); + match state { + END => valid_up_to = pos + 1, + ERR => { + debug_assert!(pos >= valid_up_to && pos - valid_up_to <= 3); + let error_len = + Some(if pos == valid_up_to { 1 } else { (pos - valid_up_to) as u8 }); + return Err(Utf8Error { valid_up_to, error_len }); + } + // Keep going + _ => {} + } + pos += 1; + } + if state != END { Err(Utf8Error { valid_up_to, error_len: None }) } else { Ok(()) } +} + +#[inline] +fn validate_utf8_impl(inp: &[u8]) -> Result<(), Utf8Error> { + let mut state = END; + let mut pos = 0; + let (chunks, tail) = inp.as_chunks::(); + if !chunks.is_empty() { + let mut last_state = state; + let mut chunk_iter = chunks.iter(); + while let Some(chunk) = chunk_iter.next() { + if state == END && all_ascii_chunk(chunk) { + let skipped = skip_ascii_chunks(&mut chunk_iter); + pos += skipped * CHUNK_LEN; + } else { + state = dfa_run(state, chunk); + if unlikely(state == ERR) { + break; + } + last_state = state; + } + pos += core::mem::size_of_val(chunk); + } + if unlikely(state == ERR) { + return Err(utf8_find_error(inp, pos, last_state != END)); + } + } + // Did we leave the optimized loop in the middle of a UTF-8 sequence? + let was_mid_char = state != END; + debug_assert!(state != ERR); + if !tail.is_empty() { + // Check and early return if the last CHUNK_LEN bytes were all ASCII. The + // motivation here is to avoid bringing the DFA table into the cache for + // pure ASCII. + // + // 1. avoid touching the DFA table for pure-ASCII input. + // 2. add a branch into the `dfa_run` inner loop. + // + // So we check and see if the last CHUNK_LEN bytes were all ASCII. This does + // compare with a few bytes that we've already processed, but handling + // that is not required for correctness (and doing so seems ot hurt + // performance) + if !was_mid_char && inp.len() >= CHUNK_LEN && tail.len() < CHUNK_LEN { + use crate::convert::TryFrom; + let range = (inp.len() - CHUNK_LEN)..inp.len(); + debug_assert!(range.contains(&pos), "{:?}", (range, pos)); + if all_ascii_chunk(<&[u8; CHUNK_LEN]>::try_from(&inp[range]).unwrap()) { + return Ok(()); + } + } + + state = dfa_run(state, tail); + } + + if likely(state == END) { + return Ok(()); + } + let (index, backup) = + if state == ERR { (inp.len() - tail.len(), was_mid_char) } else { (inp.len(), true) }; + Err(utf8_find_error(inp, index, backup)) +} + +#[inline] +fn backup_not_yet_invalid(inp: &[u8], mut pos: usize) -> usize { + debug_assert!(!inp.is_empty() && inp.get(..pos).is_some()); + while pos != 0 { + pos -= 1; + let is_cont = (inp[pos] & 0b1100_0000) == 0b1000_0000; + if !is_cont { + break; + } + } + pos +} + +#[cold] +fn utf8_find_error(input: &[u8], mut pos: usize, backup: bool) -> Utf8Error { + debug_assert!(!input.is_empty()); + if backup { + pos = backup_not_yet_invalid(input, pos); + } + validate_carefully(input, pos).unwrap_err() +} + +#[inline] +fn skip_ascii_chunks(s: &mut core::slice::Iter<'_, [u8; CHUNK_LEN]>) -> usize { + let mut i = 0; + let initial_slice = s.as_slice(); + while let Some(c) = s.next() { + if !all_ascii_chunk(c) { + break; + } + i += 1; + } + *s = initial_slice[i..].iter(); + i +} + +#[inline] +fn all_ascii_chunk(s: &[u8; CHUNK_LEN]) -> bool { + // Sadly, `core::simd` currently does not compile very efficiently on some + // targets (all the targets without simd, and some of the targets with it). + // + // It's also somewhat untested on others, so out of an abundance of caution + // we avoid it on any target that isn't both: + // - Known to support it efficiently. + // - Actually something we'd use and test on in the versions of libcore we + // ship. + const SIMD_ASCII_TEST: bool = cfg!(any( + all(any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse2",), + all(target_arch = "aarch64", target_feature = "neon"), + )); + + if SIMD_ASCII_TEST { + use crate::simd::*; + // Workaround for :( + let simd_chunk = Simd::::from_array(*s); + if cfg!(target_arch = "aarch64") { + simd_chunk.reduce_max() < 0x80 + } else { + const ALL_HI: Simd = Simd::from_array([0x80; CHUNK_LEN]); + const ZERO: Simd = Simd::from_array([0; CHUNK_LEN]); + (simd_chunk & ALL_HI).simd_eq(ZERO).all() + } + } else { + // On targets where `core::simd` doesn't compile to efficient code we + // manually do the equivalent using u64-based SWAR using u64. Using u64 and + // not `usize` here seems better on 32 bit which have 64 bit register + // access, but ends up just being an extra unroll step on ones which don't + // (so no worse, and possibly still better). + type SwarWord = u64; + const WORD_BYTES: usize = core::mem::size_of::(); + const _: () = assert!((CHUNK_LEN % WORD_BYTES) == 0 && CHUNK_LEN != 0); + let (arr, rem) = s.as_chunks::(); + debug_assert!(rem.is_empty() && !arr.is_empty()); + let mut combined = 0; + for word_bytes in arr { + combined |= SwarWord::from_ne_bytes(*word_bytes); + } + const ALL_HI: SwarWord = SwarWord::from_ne_bytes([0x80; WORD_BYTES]); + (combined & ALL_HI) == 0 + } +} + +#[inline] +fn is_ascii_small(s: &[u8]) -> bool { + // LLVM seems to get pretty aggressive if we use a loop here, even if we + // check the length first, which can end up having some pretty disasterous + // impacts on performance, seemingly due to inlining(?). In any case. + // + // Note that doing this for many sizes + // + // It ends up causing performance problems (probably due to lower + // willingness to inline. Instead of that, we handle a small number of + // len-ranges by doing reads that intentionally overlap for some of the + // slice lengths. Note that going overboard here will result in branch + // prediction issues, so this is intentionally minimal -- just enough to + // handle lengths up to `CHUNK_LEN` without pain + match s.len() { + // Actually handled in caller, + 0 => true, + 1..=3 => { + // Note: If `a`, `b`, and `c` are all ASCII bytes, then `a | b | c` + // will be too. + let all_bytes_ored = s[0] | s[s.len() / 2] | s[s.len() - 1]; + (all_bytes_ored & 0x80) == 0 + } + 4..=16 => { + // SAFETY: `off..(off + 4)` should be in bounds for `n`. + #[inline(always)] + unsafe fn read32_unchecked(n: &[u8], off: usize) -> u32 { + debug_assert!(n.get(off..(off + core::mem::size_of::())).is_some()); + // SAFETY: requirements passed on to caller + unsafe { n.as_ptr().add(off).cast::().read_unaligned() } + } + // SAFETY: All these reads are guaranteed to be in-bounds for all + // `s.len()` values in the between 4..=16 range. Sadly, the compiler + // doesn't seem to be able to remove bounds checks on expressions + // involving `mid_round_down` (no matter how I phrase it), so we need + // the unsafe. + let all_u32_ored = unsafe { + let mid_round_down = (s.len() / 2) & !3; + let tail = s.len() - 4; + read32_unchecked(s, 0) + | read32_unchecked(s, mid_round_down) + | read32_unchecked(s, tail - mid_round_down) + | read32_unchecked(s, tail) + }; + (all_u32_ored & 0x80808080) == 0 + } + _ => false, + } +}