new lint: char_indices_as_byte_indices (rust-lang#13435)

Centri3 · web-flow · commit 4f58673f8d14 · 2025-03-30T17:28:17.000Z
Closes rust-lang#10202. This adds a new lint that checks for uses of the `.chars().enumerate()` position in a context where a byte index is required and suggests changing it to use `.char_indices()` instead. I'm planning to extend this lint to also detect uses of the position in iterator chains, e.g. `s.chars().enumerate().for_each(|(i, _)| s.split_at(i));`, but that's for another time ---------------- changelog: new lint: `chars_enumerate_for_byte_indices`
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5516,6 +5516,7 @@ Released 2018-09-13
 [`cast_slice_different_sizes`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_different_sizes
 [`cast_slice_from_raw_parts`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_from_raw_parts
 [`cfg_not_test`]: https://rust-lang.github.io/rust-clippy/master/index.html#cfg_not_test
+[`char_indices_as_byte_indices`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_indices_as_byte_indices
 [`char_lit_as_u8`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_lit_as_u8
 [`chars_last_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_last_cmp
 [`chars_next_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_next_cmp
diff --git a/clippy_lints/src/declared_lints.rs b/clippy_lints/src/declared_lints.rs
@@ -287,6 +287,7 @@ pub static LINTS: &[&crate::LintInfo] = &[
     crate::literal_representation::UNREADABLE_LITERAL_INFO,
     crate::literal_representation::UNUSUAL_BYTE_GROUPINGS_INFO,
     crate::literal_string_with_formatting_args::LITERAL_STRING_WITH_FORMATTING_ARGS_INFO,
+    crate::loops::CHAR_INDICES_AS_BYTE_INDICES_INFO,
     crate::loops::EMPTY_LOOP_INFO,
     crate::loops::EXPLICIT_COUNTER_LOOP_INFO,
     crate::loops::EXPLICIT_INTO_ITER_LOOP_INFO,
diff --git a/clippy_lints/src/loops/char_indices_as_byte_indices.rs b/clippy_lints/src/loops/char_indices_as_byte_indices.rs
@@ -0,0 +1,141 @@
+use std::ops::ControlFlow;
+
+use clippy_utils::diagnostics::span_lint_hir_and_then;
+use clippy_utils::ty::is_type_lang_item;
+use clippy_utils::visitors::for_each_expr;
+use clippy_utils::{eq_expr_value, higher, path_to_local_id};
+use rustc_errors::{Applicability, MultiSpan};
+use rustc_hir::{Expr, ExprKind, LangItem, Node, Pat, PatKind};
+use rustc_lint::LateContext;
+use rustc_middle::ty::Ty;
+use rustc_span::{Span, sym};
+
+use super::CHAR_INDICES_AS_BYTE_INDICES;
+
+// The list of `str` methods we want to lint that have a `usize` argument representing a byte index.
+// Note: `String` also has methods that work with byte indices,
+// but they all take `&mut self` and aren't worth considering since the user couldn't have called
+// them while the chars iterator is live anyway.
+const BYTE_INDEX_METHODS: &[&str] = &[
+    "is_char_boundary",
+    "floor_char_boundary",
+    "ceil_char_boundary",
+    "get",
+    "index",
+    "index_mut",
+    "get_mut",
+    "get_unchecked",
+    "get_unchecked_mut",
+    "slice_unchecked",
+    "slice_mut_unchecked",
+    "split_at",
+    "split_at_mut",
+    "split_at_checked",
+    "split_at_mut_checked",
+];
+
+const CONTINUE: ControlFlow<!, ()> = ControlFlow::Continue(());
+
+pub(super) fn check<'tcx>(cx: &LateContext<'tcx>, pat: &Pat<'_>, iterable: &Expr<'_>, body: &'tcx Expr<'tcx>) {
+    if let ExprKind::MethodCall(_, enumerate_recv, _, enumerate_span) = iterable.kind
+        && let Some(method_id) = cx.typeck_results().type_dependent_def_id(iterable.hir_id)
+        && cx.tcx.is_diagnostic_item(sym::enumerate_method, method_id)
+        && let ExprKind::MethodCall(_, chars_recv, _, chars_span) = enumerate_recv.kind
+        && let Some(method_id) = cx.typeck_results().type_dependent_def_id(enumerate_recv.hir_id)
+        && cx.tcx.is_diagnostic_item(sym::str_chars, method_id)
+    {
+        if let PatKind::Tuple([pat, _], _) = pat.kind
+            && let PatKind::Binding(_, binding_id, ..) = pat.kind
+        {
+            // Destructured iterator element `(idx, _)`, look for uses of the binding
+            for_each_expr(cx, body, |expr| {
+                if path_to_local_id(expr, binding_id) {
+                    check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
+                }
+                CONTINUE
+            });
+        } else if let PatKind::Binding(_, binding_id, ..) = pat.kind {
+            // Bound as a tuple, look for `tup.0`
+            for_each_expr(cx, body, |expr| {
+                if let ExprKind::Field(e, field) = expr.kind
+                    && path_to_local_id(e, binding_id)
+                    && field.name == sym::integer(0)
+                {
+                    check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
+                }
+                CONTINUE
+            });
+        }
+    }
+}
+
+fn check_index_usage<'tcx>(
+    cx: &LateContext<'tcx>,
+    expr: &'tcx Expr<'tcx>,
+    pat: &Pat<'_>,
+    enumerate_span: Span,
+    chars_span: Span,
+    chars_recv: &Expr<'_>,
+) {
+    let Some(parent_expr) = index_consumed_at(cx, expr) else {
+        return;
+    };
+
+    let is_string_like = |ty: Ty<'_>| ty.is_str() || is_type_lang_item(cx, ty, LangItem::String);
+    let message = match parent_expr.kind {
+        ExprKind::MethodCall(segment, recv, ..)
+            // We currently only lint `str` methods (which `String` can deref to), so a `.is_str()` check is sufficient here
+            // (contrary to the `ExprKind::Index` case which needs to handle both with `is_string_like` because `String` implements
+            // `Index` directly and no deref to `str` would happen in that case).
+            if cx.typeck_results().expr_ty_adjusted(recv).peel_refs().is_str()
+                && BYTE_INDEX_METHODS.contains(&segment.ident.name.as_str())
+                && eq_expr_value(cx, chars_recv, recv) =>
+        {
+            "passing a character position to a method that expects a byte index"
+        },
+        ExprKind::Index(target, ..)
+            if is_string_like(cx.typeck_results().expr_ty_adjusted(target).peel_refs())
+                && eq_expr_value(cx, chars_recv, target) =>
+        {
+            "indexing into a string with a character position where a byte index is expected"
+        },
+        _ => return,
+    };
+
+    span_lint_hir_and_then(
+        cx,
+        CHAR_INDICES_AS_BYTE_INDICES,
+        expr.hir_id,
+        expr.span,
+        message,
+        |diag| {
+            diag.note("a character can take up more than one byte, so they are not interchangeable")
+                .span_note(
+                    MultiSpan::from_spans(vec![pat.span, enumerate_span]),
+                    "position comes from the enumerate iterator",
+                )
+                .span_suggestion_verbose(
+                    chars_span.to(enumerate_span),
+                    "consider using `.char_indices()` instead",
+                    "char_indices()",
+                    Applicability::MaybeIncorrect,
+                );
+        },
+    );
+}
+
+/// Returns the expression which ultimately consumes the index.
+/// This is usually the parent expression, i.e. `.split_at(idx)` for `idx`,
+/// but for `.get(..idx)` we want to consider the method call the consuming expression,
+/// which requires skipping past the range expression.
+fn index_consumed_at<'tcx>(cx: &LateContext<'tcx>, expr: &'tcx Expr<'tcx>) -> Option<&'tcx Expr<'tcx>> {
+    for (_, node) in cx.tcx.hir_parent_iter(expr.hir_id) {
+        match node {
+            Node::Expr(expr) if higher::Range::hir(expr).is_some() => {},
+            Node::ExprField(_) => {},
+            Node::Expr(expr) => return Some(expr),
+            _ => break,
+        }
+    }
+    None
+}
diff --git a/clippy_lints/src/loops/mod.rs b/clippy_lints/src/loops/mod.rs
@@ -1,3 +1,4 @@
+mod char_indices_as_byte_indices;
 mod empty_loop;
 mod explicit_counter_loop;
 mod explicit_into_iter_loop;
@@ -740,6 +741,49 @@ declare_clippy_lint! {
     "manually filling a slice with a value"
 }
 
+declare_clippy_lint! {
+    /// ### What it does
+    /// Checks for usage of a character position yielded by `.chars().enumerate()` in a context where a **byte index** is expected,
+    /// such as an argument to a specific `str` method or indexing into a `str` or `String`.
+    ///
+    /// ### Why is this bad?
+    /// A character (more specifically, a Unicode scalar value) that is yielded by `str::chars` can take up multiple bytes,
+    /// so a character position does not necessarily have the same byte index at which the character is stored.
+    /// Thus, using the character position where a byte index is expected can unexpectedly return wrong values
+    /// or panic when the string consists of multibyte characters.
+    ///
+    /// For example, the character `a` in `äa` is stored at byte index 2 but has the character position 1.
+    /// Using the character position 1 to index into the string will lead to a panic as it is in the middle of the first character.
+    ///
+    /// Instead of `.chars().enumerate()`, the correct iterator to use is `.char_indices()`, which yields byte indices.
+    ///
+    /// This pattern is technically fine if the strings are known to only use the ASCII subset,
+    /// though in those cases it would be better to use `bytes()` directly to make the intent clearer,
+    /// but there is also no downside to just using `.char_indices()` directly and supporting non-ASCII strings.
+    ///
+    /// You may also want to read the [chapter on strings in the Rust Book](https://doc.rust-lang.org/book/ch08-02-strings.html)
+    /// which goes into this in more detail.
+    ///
+    /// ### Example
+    /// ```no_run
+    /// # let s = "...";
+    /// for (idx, c) in s.chars().enumerate() {
+    ///     let _ = s[idx..]; // ⚠️ Panics for strings consisting of multibyte characters
+    /// }
+    /// ```
+    /// Use instead:
+    /// ```no_run
+    /// # let s = "...";
+    /// for (idx, c) in s.char_indices() {
+    ///     let _ = s[idx..];
+    /// }
+    /// ```
+    #[clippy::version = "1.83.0"]
+    pub CHAR_INDICES_AS_BYTE_INDICES,
+    correctness,
+    "using the character position yielded by `.chars().enumerate()` in a context where a byte index is expected"
+}
+
 pub struct Loops {
     msrv: Msrv,
     enforce_iter_loop_reborrow: bool,
@@ -777,6 +821,7 @@ impl_lint_pass!(Loops => [
     UNUSED_ENUMERATE_INDEX,
     INFINITE_LOOP,
     MANUAL_SLICE_FILL,
+    CHAR_INDICES_AS_BYTE_INDICES,
 ]);
 
 impl<'tcx> LateLintPass<'tcx> for Loops {
@@ -860,6 +905,7 @@ impl Loops {
         manual_flatten::check(cx, pat, arg, body, span, self.msrv);
         manual_find::check(cx, pat, arg, body, span, expr);
         unused_enumerate_index::check(cx, pat, arg, body);
+        char_indices_as_byte_indices::check(cx, pat, arg, body);
     }
 
     fn check_for_loop_arg(&self, cx: &LateContext<'_>, _: &Pat<'_>, arg: &Expr<'_>) {
diff --git a/tests/ui/char_indices_as_byte_indices.fixed b/tests/ui/char_indices_as_byte_indices.fixed
@@ -0,0 +1,65 @@
+#![feature(round_char_boundary)]
+#![warn(clippy::char_indices_as_byte_indices)]
+
+trait StrExt {
+    fn use_index(&self, _: usize);
+}
+impl StrExt for str {
+    fn use_index(&self, _: usize) {}
+}
+
+fn bad(prim: &str, string: String) {
+    for (idx, _) in prim.char_indices() {
+        let _ = prim[..idx];
+        //~^ char_indices_as_byte_indices
+        prim.split_at(idx);
+        //~^ char_indices_as_byte_indices
+
+        // This won't panic, but it can still return a wrong substring
+        let _ = prim[..prim.floor_char_boundary(idx)];
+        //~^ char_indices_as_byte_indices
+
+        // can't use #[expect] here because the .fixed file will still have the attribute and create an
+        // unfulfilled expectation, but make sure lint level attributes work on the use expression:
+        #[allow(clippy::char_indices_as_byte_indices)]
+        let _ = prim[..idx];
+    }
+
+    for c in prim.char_indices() {
+        let _ = prim[..c.0];
+        //~^ char_indices_as_byte_indices
+        prim.split_at(c.0);
+        //~^ char_indices_as_byte_indices
+    }
+
+    for (idx, _) in string.char_indices() {
+        let _ = string[..idx];
+        //~^ char_indices_as_byte_indices
+        string.split_at(idx);
+        //~^ char_indices_as_byte_indices
+    }
+}
+
+fn good(prim: &str, prim2: &str) {
+    for (idx, _) in prim.chars().enumerate() {
+        // Indexing into a different string
+        let _ = prim2[..idx];
+
+        // Unknown use
+        std::hint::black_box(idx);
+
+        // Method call to user defined extension trait
+        prim.use_index(idx);
+
+        // str method taking a usize that doesn't represent a byte index
+        prim.splitn(idx, prim2);
+    }
+
+    let mut string = "äa".to_owned();
+    for (idx, _) in string.clone().chars().enumerate() {
+        // Even though the receiver is the same expression, it should not be treated as the same value.
+        string.clone().remove(idx);
+    }
+}
+
+fn main() {}
diff --git a/tests/ui/char_indices_as_byte_indices.rs b/tests/ui/char_indices_as_byte_indices.rs
@@ -0,0 +1,65 @@
+#![feature(round_char_boundary)]
+#![warn(clippy::char_indices_as_byte_indices)]
+
+trait StrExt {
+    fn use_index(&self, _: usize);
+}
+impl StrExt for str {
+    fn use_index(&self, _: usize) {}
+}
+
+fn bad(prim: &str, string: String) {
+    for (idx, _) in prim.chars().enumerate() {
+        let _ = prim[..idx];
+        //~^ char_indices_as_byte_indices
+        prim.split_at(idx);
+        //~^ char_indices_as_byte_indices
+
+        // This won't panic, but it can still return a wrong substring
+        let _ = prim[..prim.floor_char_boundary(idx)];
+        //~^ char_indices_as_byte_indices
+
+        // can't use #[expect] here because the .fixed file will still have the attribute and create an
+        // unfulfilled expectation, but make sure lint level attributes work on the use expression:
+        #[allow(clippy::char_indices_as_byte_indices)]
+        let _ = prim[..idx];
+    }
+
+    for c in prim.chars().enumerate() {
+        let _ = prim[..c.0];
+        //~^ char_indices_as_byte_indices
+        prim.split_at(c.0);
+        //~^ char_indices_as_byte_indices
+    }
+
+    for (idx, _) in string.chars().enumerate() {
+        let _ = string[..idx];
+        //~^ char_indices_as_byte_indices
+        string.split_at(idx);
+        //~^ char_indices_as_byte_indices
+    }
+}
+
+fn good(prim: &str, prim2: &str) {
+    for (idx, _) in prim.chars().enumerate() {
+        // Indexing into a different string
+        let _ = prim2[..idx];
+
+        // Unknown use
+        std::hint::black_box(idx);
+
+        // Method call to user defined extension trait
+        prim.use_index(idx);
+
+        // str method taking a usize that doesn't represent a byte index
+        prim.splitn(idx, prim2);
+    }
+
+    let mut string = "äa".to_owned();
+    for (idx, _) in string.clone().chars().enumerate() {
+        // Even though the receiver is the same expression, it should not be treated as the same value.
+        string.clone().remove(idx);
+    }
+}
+
+fn main() {}
diff --git a/tests/ui/char_indices_as_byte_indices.stderr b/tests/ui/char_indices_as_byte_indices.stderr