Skip to content

Commit 4f58673

Browse files
authored
new lint: char_indices_as_byte_indices (rust-lang#13435)
Closes rust-lang#10202. This adds a new lint that checks for uses of the `.chars().enumerate()` position in a context where a byte index is required and suggests changing it to use `.char_indices()` instead. I'm planning to extend this lint to also detect uses of the position in iterator chains, e.g. `s.chars().enumerate().for_each(|(i, _)| s.split_at(i));`, but that's for another time ---------------- changelog: new lint: `chars_enumerate_for_byte_indices`
2 parents cd67b3b + c739027 commit 4f58673

File tree

7 files changed

+449
-0
lines changed

7 files changed

+449
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5516,6 +5516,7 @@ Released 2018-09-13
55165516
[`cast_slice_different_sizes`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_different_sizes
55175517
[`cast_slice_from_raw_parts`]: https://rust-lang.github.io/rust-clippy/master/index.html#cast_slice_from_raw_parts
55185518
[`cfg_not_test`]: https://rust-lang.github.io/rust-clippy/master/index.html#cfg_not_test
5519+
[`char_indices_as_byte_indices`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_indices_as_byte_indices
55195520
[`char_lit_as_u8`]: https://rust-lang.github.io/rust-clippy/master/index.html#char_lit_as_u8
55205521
[`chars_last_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_last_cmp
55215522
[`chars_next_cmp`]: https://rust-lang.github.io/rust-clippy/master/index.html#chars_next_cmp

clippy_lints/src/declared_lints.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ pub static LINTS: &[&crate::LintInfo] = &[
287287
crate::literal_representation::UNREADABLE_LITERAL_INFO,
288288
crate::literal_representation::UNUSUAL_BYTE_GROUPINGS_INFO,
289289
crate::literal_string_with_formatting_args::LITERAL_STRING_WITH_FORMATTING_ARGS_INFO,
290+
crate::loops::CHAR_INDICES_AS_BYTE_INDICES_INFO,
290291
crate::loops::EMPTY_LOOP_INFO,
291292
crate::loops::EXPLICIT_COUNTER_LOOP_INFO,
292293
crate::loops::EXPLICIT_INTO_ITER_LOOP_INFO,
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
use std::ops::ControlFlow;
2+
3+
use clippy_utils::diagnostics::span_lint_hir_and_then;
4+
use clippy_utils::ty::is_type_lang_item;
5+
use clippy_utils::visitors::for_each_expr;
6+
use clippy_utils::{eq_expr_value, higher, path_to_local_id};
7+
use rustc_errors::{Applicability, MultiSpan};
8+
use rustc_hir::{Expr, ExprKind, LangItem, Node, Pat, PatKind};
9+
use rustc_lint::LateContext;
10+
use rustc_middle::ty::Ty;
11+
use rustc_span::{Span, sym};
12+
13+
use super::CHAR_INDICES_AS_BYTE_INDICES;
14+
15+
// The list of `str` methods we want to lint that have a `usize` argument representing a byte index.
16+
// Note: `String` also has methods that work with byte indices,
17+
// but they all take `&mut self` and aren't worth considering since the user couldn't have called
18+
// them while the chars iterator is live anyway.
19+
const BYTE_INDEX_METHODS: &[&str] = &[
20+
"is_char_boundary",
21+
"floor_char_boundary",
22+
"ceil_char_boundary",
23+
"get",
24+
"index",
25+
"index_mut",
26+
"get_mut",
27+
"get_unchecked",
28+
"get_unchecked_mut",
29+
"slice_unchecked",
30+
"slice_mut_unchecked",
31+
"split_at",
32+
"split_at_mut",
33+
"split_at_checked",
34+
"split_at_mut_checked",
35+
];
36+
37+
const CONTINUE: ControlFlow<!, ()> = ControlFlow::Continue(());
38+
39+
pub(super) fn check<'tcx>(cx: &LateContext<'tcx>, pat: &Pat<'_>, iterable: &Expr<'_>, body: &'tcx Expr<'tcx>) {
40+
if let ExprKind::MethodCall(_, enumerate_recv, _, enumerate_span) = iterable.kind
41+
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(iterable.hir_id)
42+
&& cx.tcx.is_diagnostic_item(sym::enumerate_method, method_id)
43+
&& let ExprKind::MethodCall(_, chars_recv, _, chars_span) = enumerate_recv.kind
44+
&& let Some(method_id) = cx.typeck_results().type_dependent_def_id(enumerate_recv.hir_id)
45+
&& cx.tcx.is_diagnostic_item(sym::str_chars, method_id)
46+
{
47+
if let PatKind::Tuple([pat, _], _) = pat.kind
48+
&& let PatKind::Binding(_, binding_id, ..) = pat.kind
49+
{
50+
// Destructured iterator element `(idx, _)`, look for uses of the binding
51+
for_each_expr(cx, body, |expr| {
52+
if path_to_local_id(expr, binding_id) {
53+
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
54+
}
55+
CONTINUE
56+
});
57+
} else if let PatKind::Binding(_, binding_id, ..) = pat.kind {
58+
// Bound as a tuple, look for `tup.0`
59+
for_each_expr(cx, body, |expr| {
60+
if let ExprKind::Field(e, field) = expr.kind
61+
&& path_to_local_id(e, binding_id)
62+
&& field.name == sym::integer(0)
63+
{
64+
check_index_usage(cx, expr, pat, enumerate_span, chars_span, chars_recv);
65+
}
66+
CONTINUE
67+
});
68+
}
69+
}
70+
}
71+
72+
fn check_index_usage<'tcx>(
73+
cx: &LateContext<'tcx>,
74+
expr: &'tcx Expr<'tcx>,
75+
pat: &Pat<'_>,
76+
enumerate_span: Span,
77+
chars_span: Span,
78+
chars_recv: &Expr<'_>,
79+
) {
80+
let Some(parent_expr) = index_consumed_at(cx, expr) else {
81+
return;
82+
};
83+
84+
let is_string_like = |ty: Ty<'_>| ty.is_str() || is_type_lang_item(cx, ty, LangItem::String);
85+
let message = match parent_expr.kind {
86+
ExprKind::MethodCall(segment, recv, ..)
87+
// We currently only lint `str` methods (which `String` can deref to), so a `.is_str()` check is sufficient here
88+
// (contrary to the `ExprKind::Index` case which needs to handle both with `is_string_like` because `String` implements
89+
// `Index` directly and no deref to `str` would happen in that case).
90+
if cx.typeck_results().expr_ty_adjusted(recv).peel_refs().is_str()
91+
&& BYTE_INDEX_METHODS.contains(&segment.ident.name.as_str())
92+
&& eq_expr_value(cx, chars_recv, recv) =>
93+
{
94+
"passing a character position to a method that expects a byte index"
95+
},
96+
ExprKind::Index(target, ..)
97+
if is_string_like(cx.typeck_results().expr_ty_adjusted(target).peel_refs())
98+
&& eq_expr_value(cx, chars_recv, target) =>
99+
{
100+
"indexing into a string with a character position where a byte index is expected"
101+
},
102+
_ => return,
103+
};
104+
105+
span_lint_hir_and_then(
106+
cx,
107+
CHAR_INDICES_AS_BYTE_INDICES,
108+
expr.hir_id,
109+
expr.span,
110+
message,
111+
|diag| {
112+
diag.note("a character can take up more than one byte, so they are not interchangeable")
113+
.span_note(
114+
MultiSpan::from_spans(vec![pat.span, enumerate_span]),
115+
"position comes from the enumerate iterator",
116+
)
117+
.span_suggestion_verbose(
118+
chars_span.to(enumerate_span),
119+
"consider using `.char_indices()` instead",
120+
"char_indices()",
121+
Applicability::MaybeIncorrect,
122+
);
123+
},
124+
);
125+
}
126+
127+
/// Returns the expression which ultimately consumes the index.
128+
/// This is usually the parent expression, i.e. `.split_at(idx)` for `idx`,
129+
/// but for `.get(..idx)` we want to consider the method call the consuming expression,
130+
/// which requires skipping past the range expression.
131+
fn index_consumed_at<'tcx>(cx: &LateContext<'tcx>, expr: &'tcx Expr<'tcx>) -> Option<&'tcx Expr<'tcx>> {
132+
for (_, node) in cx.tcx.hir_parent_iter(expr.hir_id) {
133+
match node {
134+
Node::Expr(expr) if higher::Range::hir(expr).is_some() => {},
135+
Node::ExprField(_) => {},
136+
Node::Expr(expr) => return Some(expr),
137+
_ => break,
138+
}
139+
}
140+
None
141+
}

clippy_lints/src/loops/mod.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod char_indices_as_byte_indices;
12
mod empty_loop;
23
mod explicit_counter_loop;
34
mod explicit_into_iter_loop;
@@ -740,6 +741,49 @@ declare_clippy_lint! {
740741
"manually filling a slice with a value"
741742
}
742743

744+
declare_clippy_lint! {
745+
/// ### What it does
746+
/// Checks for usage of a character position yielded by `.chars().enumerate()` in a context where a **byte index** is expected,
747+
/// such as an argument to a specific `str` method or indexing into a `str` or `String`.
748+
///
749+
/// ### Why is this bad?
750+
/// A character (more specifically, a Unicode scalar value) that is yielded by `str::chars` can take up multiple bytes,
751+
/// so a character position does not necessarily have the same byte index at which the character is stored.
752+
/// Thus, using the character position where a byte index is expected can unexpectedly return wrong values
753+
/// or panic when the string consists of multibyte characters.
754+
///
755+
/// For example, the character `a` in `äa` is stored at byte index 2 but has the character position 1.
756+
/// Using the character position 1 to index into the string will lead to a panic as it is in the middle of the first character.
757+
///
758+
/// Instead of `.chars().enumerate()`, the correct iterator to use is `.char_indices()`, which yields byte indices.
759+
///
760+
/// This pattern is technically fine if the strings are known to only use the ASCII subset,
761+
/// though in those cases it would be better to use `bytes()` directly to make the intent clearer,
762+
/// but there is also no downside to just using `.char_indices()` directly and supporting non-ASCII strings.
763+
///
764+
/// You may also want to read the [chapter on strings in the Rust Book](https://doc.rust-lang.org/book/ch08-02-strings.html)
765+
/// which goes into this in more detail.
766+
///
767+
/// ### Example
768+
/// ```no_run
769+
/// # let s = "...";
770+
/// for (idx, c) in s.chars().enumerate() {
771+
/// let _ = s[idx..]; // ⚠️ Panics for strings consisting of multibyte characters
772+
/// }
773+
/// ```
774+
/// Use instead:
775+
/// ```no_run
776+
/// # let s = "...";
777+
/// for (idx, c) in s.char_indices() {
778+
/// let _ = s[idx..];
779+
/// }
780+
/// ```
781+
#[clippy::version = "1.83.0"]
782+
pub CHAR_INDICES_AS_BYTE_INDICES,
783+
correctness,
784+
"using the character position yielded by `.chars().enumerate()` in a context where a byte index is expected"
785+
}
786+
743787
pub struct Loops {
744788
msrv: Msrv,
745789
enforce_iter_loop_reborrow: bool,
@@ -777,6 +821,7 @@ impl_lint_pass!(Loops => [
777821
UNUSED_ENUMERATE_INDEX,
778822
INFINITE_LOOP,
779823
MANUAL_SLICE_FILL,
824+
CHAR_INDICES_AS_BYTE_INDICES,
780825
]);
781826

782827
impl<'tcx> LateLintPass<'tcx> for Loops {
@@ -860,6 +905,7 @@ impl Loops {
860905
manual_flatten::check(cx, pat, arg, body, span, self.msrv);
861906
manual_find::check(cx, pat, arg, body, span, expr);
862907
unused_enumerate_index::check(cx, pat, arg, body);
908+
char_indices_as_byte_indices::check(cx, pat, arg, body);
863909
}
864910

865911
fn check_for_loop_arg(&self, cx: &LateContext<'_>, _: &Pat<'_>, arg: &Expr<'_>) {
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#![feature(round_char_boundary)]
2+
#![warn(clippy::char_indices_as_byte_indices)]
3+
4+
trait StrExt {
5+
fn use_index(&self, _: usize);
6+
}
7+
impl StrExt for str {
8+
fn use_index(&self, _: usize) {}
9+
}
10+
11+
fn bad(prim: &str, string: String) {
12+
for (idx, _) in prim.char_indices() {
13+
let _ = prim[..idx];
14+
//~^ char_indices_as_byte_indices
15+
prim.split_at(idx);
16+
//~^ char_indices_as_byte_indices
17+
18+
// This won't panic, but it can still return a wrong substring
19+
let _ = prim[..prim.floor_char_boundary(idx)];
20+
//~^ char_indices_as_byte_indices
21+
22+
// can't use #[expect] here because the .fixed file will still have the attribute and create an
23+
// unfulfilled expectation, but make sure lint level attributes work on the use expression:
24+
#[allow(clippy::char_indices_as_byte_indices)]
25+
let _ = prim[..idx];
26+
}
27+
28+
for c in prim.char_indices() {
29+
let _ = prim[..c.0];
30+
//~^ char_indices_as_byte_indices
31+
prim.split_at(c.0);
32+
//~^ char_indices_as_byte_indices
33+
}
34+
35+
for (idx, _) in string.char_indices() {
36+
let _ = string[..idx];
37+
//~^ char_indices_as_byte_indices
38+
string.split_at(idx);
39+
//~^ char_indices_as_byte_indices
40+
}
41+
}
42+
43+
fn good(prim: &str, prim2: &str) {
44+
for (idx, _) in prim.chars().enumerate() {
45+
// Indexing into a different string
46+
let _ = prim2[..idx];
47+
48+
// Unknown use
49+
std::hint::black_box(idx);
50+
51+
// Method call to user defined extension trait
52+
prim.use_index(idx);
53+
54+
// str method taking a usize that doesn't represent a byte index
55+
prim.splitn(idx, prim2);
56+
}
57+
58+
let mut string = "äa".to_owned();
59+
for (idx, _) in string.clone().chars().enumerate() {
60+
// Even though the receiver is the same expression, it should not be treated as the same value.
61+
string.clone().remove(idx);
62+
}
63+
}
64+
65+
fn main() {}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#![feature(round_char_boundary)]
2+
#![warn(clippy::char_indices_as_byte_indices)]
3+
4+
trait StrExt {
5+
fn use_index(&self, _: usize);
6+
}
7+
impl StrExt for str {
8+
fn use_index(&self, _: usize) {}
9+
}
10+
11+
fn bad(prim: &str, string: String) {
12+
for (idx, _) in prim.chars().enumerate() {
13+
let _ = prim[..idx];
14+
//~^ char_indices_as_byte_indices
15+
prim.split_at(idx);
16+
//~^ char_indices_as_byte_indices
17+
18+
// This won't panic, but it can still return a wrong substring
19+
let _ = prim[..prim.floor_char_boundary(idx)];
20+
//~^ char_indices_as_byte_indices
21+
22+
// can't use #[expect] here because the .fixed file will still have the attribute and create an
23+
// unfulfilled expectation, but make sure lint level attributes work on the use expression:
24+
#[allow(clippy::char_indices_as_byte_indices)]
25+
let _ = prim[..idx];
26+
}
27+
28+
for c in prim.chars().enumerate() {
29+
let _ = prim[..c.0];
30+
//~^ char_indices_as_byte_indices
31+
prim.split_at(c.0);
32+
//~^ char_indices_as_byte_indices
33+
}
34+
35+
for (idx, _) in string.chars().enumerate() {
36+
let _ = string[..idx];
37+
//~^ char_indices_as_byte_indices
38+
string.split_at(idx);
39+
//~^ char_indices_as_byte_indices
40+
}
41+
}
42+
43+
fn good(prim: &str, prim2: &str) {
44+
for (idx, _) in prim.chars().enumerate() {
45+
// Indexing into a different string
46+
let _ = prim2[..idx];
47+
48+
// Unknown use
49+
std::hint::black_box(idx);
50+
51+
// Method call to user defined extension trait
52+
prim.use_index(idx);
53+
54+
// str method taking a usize that doesn't represent a byte index
55+
prim.splitn(idx, prim2);
56+
}
57+
58+
let mut string = "äa".to_owned();
59+
for (idx, _) in string.clone().chars().enumerate() {
60+
// Even though the receiver is the same expression, it should not be treated as the same value.
61+
string.clone().remove(idx);
62+
}
63+
}
64+
65+
fn main() {}

0 commit comments

Comments
 (0)