Skip to content
This repository was archived by the owner on May 28, 2025. It is now read-only.

Commit 33c076e

Browse files
committed
Inspect the first byte before skipping ASCII chunk
This reduces the chance of entering the ASCII bypass without making any progress, and thus reduces branch flipping.
1 parent 0056ad2 commit 33c076e

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

library/core/src/str/validations.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,10 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
356356

357357
while i + MAIN_CHUNK_SIZE <= bytes.len() {
358358
// Fast path: if the current state is ACCEPT, we can skip to the next non-ASCII chunk.
359-
if st == ST_ACCEPT {
359+
// We also do a quick inspection of the first byte to avoid getting into this path at all
360+
// when handling strings with almost no ASCII, e.g. Chinese scripts.
361+
// SAFETY: `i` is in bounds.
362+
if st == ST_ACCEPT && unsafe { *bytes.get_unchecked(i) } < 0x80 {
360363
// SAFETY: `i` is in bounds.
361364
let rest = unsafe { bytes.get_unchecked(i..) };
362365
let mut ascii_chunks = rest.array_chunks::<ASCII_CHUNK_SIZE>();

0 commit comments

Comments
 (0)