Skip to content

`char_indices` is slower than a `while` loop over `Chars` for ASCII input, but faster for Unicode input #141855

Open
@hkBst

Description

@hkBst

I tried this code:

use std::ops::Range;

/// Error value handed to the callback of the `check_raw_str_*` functions in place of a
/// character that is not accepted.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Raw '\r' encountered.
    ///
    /// This is the variant that both `check_raw_str_while` and
    /// `check_raw_str_char_indices` report for a `'\r'` in their input.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    ///
    /// NOTE(review): neither function in this snippet constructs this variant.
    BareCarriageReturnInRawString,
}

/// Visits every `char` of `src` with a hand-written `while let` loop over [`str::chars`],
/// calling `callback` once per character.
///
/// The callback receives the byte range the character occupies in `src`, together with
/// `Ok(c)` for an ordinary character or `Err(EscapeError::BareCarriageReturn)` for `'\r'`.
///
/// This is the baseline implementation compared against `check_raw_str_char_indices`.
pub fn check_raw_str_while(
    src: &str,
    mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>),
) {
    let mut chars = src.chars();

    // `while let` rather than `for`: the body needs `chars.as_str()`, which requires access
    // to the iterator while it is being advanced.
    while let Some(c) = chars.next() {
        // `chars.as_str()` is the not-yet-consumed tail of `src`, so
        // `src.len() - tail.len()` is the byte offset just past `c`; stepping back by
        // `c.len_utf8()` gives the offset at which `c` starts.
        let start = src.len() - chars.as_str().len() - c.len_utf8();
        let res = match c {
            '\r' => Err(EscapeError::BareCarriageReturn),
            _ => Ok(c),
        };
        // Byte offset just past `c` (the iterator has not moved since `start` was computed).
        let end = src.len() - chars.as_str().len();
        callback(start..end, res);
    }
}

/// Visits every `char` of `src` via [`str::char_indices`], calling `callback` once per
/// character.
///
/// The callback receives the byte range the character occupies in `src`, together with
/// `Ok(c)` for an ordinary character or `Err(EscapeError::BareCarriageReturn)` for `'\r'`.
///
/// This is the iterator-based counterpart of `check_raw_str_while`; the `same` test checks
/// that the two produce identical callback sequences.
pub fn check_raw_str_char_indices(
    src: &str,
    mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>),
) {
    // `pos` is the byte offset at which `c` starts, as yielded by `char_indices`.
    src.char_indices().for_each(|(pos, c)| {
        callback(
            // The character ends `c.len_utf8()` bytes after it starts.
            pos..pos + c.len_utf8(),
            if c == '\r' {
                Err(EscapeError::BareCarriageReturn)
            } else {
                Ok(c)
            },
        );
    });
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Both implementations must report exactly the same sequence of
    /// `(byte range, result)` pairs for every input.
    ///
    /// The inputs cover the cases in which the two could plausibly diverge: the empty
    /// string, plain ASCII, bare carriage returns, and multi-byte (2-, 3- and 4-byte)
    /// UTF-8 characters.
    #[test]
    fn same() {
        let inputs = [
            "",
            "abcdefghijklmnopqrstuvwxyz0123456789",
            "a\rb\r\n",
            "\u{e9}\u{20ac}\u{1f600}",
            "a\u{e9}\r\u{20ac}z\u{1f600}\r",
        ];
        for s in inputs {
            let (mut r1, mut r2) = (vec![], vec![]);
            check_raw_str_while(s, |r, c| r1.push((r, c)));
            check_raw_str_char_indices(s, |r, c| r2.push((r, c)));
            assert_eq!(r1, r2, "implementations disagree on {s:?}");
        }
    }

    /// Pins the exact byte ranges and results for a small mixed input, so that the two
    /// implementations cannot agree with each other while both being wrong.
    #[test]
    fn ranges_and_errors() {
        // 'é' is 2 bytes, '\r' is 1 byte, '€' is 3 bytes.
        let s = "\u{e9}\r\u{20ac}";
        let expected = vec![
            (0..2, Ok('\u{e9}')),
            (2..3, Err(EscapeError::BareCarriageReturn)),
            (3..6, Ok('\u{20ac}')),
        ];

        let mut r1 = vec![];
        check_raw_str_while(s, |r, c| r1.push((r, c)));
        assert_eq!(r1, expected);

        let mut r2 = vec![];
        check_raw_str_char_indices(s, |r, c| r2.push((r, c)));
        assert_eq!(r2, expected);
    }
}

with these benches:

#![feature(test)]

extern crate test;

use bench_char_indices::*;

use std::iter::repeat_n;
/// Number of times the sample string is repeated to build each benchmark input.
const LEN: usize = 10_000;

// Generates a benchmark helper `fn $name(b, s, expected)` around the checker `$check_raw`.
//
// - `$name`:      name of the generated helper function.
// - `$unit`:      type of the `expected` argument; it is compared against the `Ok(..)`
//                 payload the checker reports, so in practice it is `char`.
// - `$check_raw`: the function under test, called as `$check_raw(&str, callback)`.
//
// The helper builds an input consisting of `s` repeated `LEN` times and, on every benchmark
// iteration, runs the checker over it while collecting each callback invocation.
macro_rules! fn_bench_check_raw {
    ($name:ident, $unit:ty, $check_raw:ident) => {
        fn $name(b: &mut test::Bencher, s: &str, expected: $unit) {
            // `black_box` keeps the optimizer from treating the input as a known constant.
            let input: String = test::black_box(repeat_n(s, LEN).collect());
            assert_eq!(input.len(), LEN * s.len());
            b.iter(|| {
                // A fresh, empty vector per iteration: the measured time therefore includes
                // growing it to `LEN` elements.
                let mut output = vec![];

                $check_raw(&input, |range, res| output.push((range, res)));
                // One callback per character is expected, so this only holds when `s` is a
                // single character.
                assert_eq!(output.len(), LEN);
                // The first character must span the first `s.len()` bytes and be accepted.
                assert_eq!(output[0], ((0..s.len()), Ok(expected)));
            });
        }
    };
}

// Instantiate one benchmark helper per implementation under comparison; both take a `char`
// as the expected first result.
fn_bench_check_raw!(bench_check_raw_str_while, char, check_raw_str_while);
fn_bench_check_raw!(
    bench_check_raw_str_char_indices,
    char,
    check_raw_str_char_indices
);

/// Benchmarks `check_raw_str_while` on an all-ASCII input (`"a"` repeated `LEN` times).
#[bench]
fn bench_check_raw_str_ascii_while(b: &mut test::Bencher) {
    bench_check_raw_str_while(b, "a", 'a');
}

/// Benchmarks `check_raw_str_char_indices` on an all-ASCII input (`"a"` repeated `LEN` times).
#[bench]
fn bench_check_raw_str_ascii_char_indices(b: &mut test::Bencher) {
    bench_check_raw_str_char_indices(b, "a", 'a');
}

I expected to see this happen: NO performance difference

Instead, the `char_indices` version is more than 10% slower than the `while` loop on ASCII input:

test bench_check_raw_str_ascii_char_indices ... bench:      27,733.96 ns/iter (+/- 103.24)
test bench_check_raw_str_ascii_while        ... bench:      24,525.01 ns/iter (+/- 582.28)

Tested on: rustc 1.88.0-nightly (2e6882a 2025-05-05)

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-Unicode (Area: Unicode) · C-optimization (Category: An issue highlighting optimization opportunities or PRs implementing such) · T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue.)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions