Skip to content

Commit ff052ee

Browse files
committed
Use restricted Damerau-Levenshtein algorithm
1 parent 231bcd1 commit ff052ee

File tree

3 files changed

+76
-25
lines changed

3 files changed

+76
-25
lines changed

compiler/rustc_span/src/lev_distance.rs

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,97 @@
1-
//! Levenshtein distances.
1+
//! Damerau-Levenshtein distances.
22
//!
3-
//! The [Levenshtein distance] is a metric for measuring the difference between two strings.
3+
//! The [Damerau-Levenshtein distance] is a metric for measuring the difference between two strings.
4+
//! This implementation is a restricted version of the algorithm, as it does not permit modifying
5+
//! characters that have already been transposed.
46
//!
5-
//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
7+
//! [Damerau-Levenshtein distance]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
68
79
use crate::symbol::Symbol;
8-
use std::cmp;
10+
use std::{cmp, mem};
911

1012
#[cfg(test)]
1113
mod tests;
1214

13-
/// Finds the Levenshtein distance between two strings.
15+
/// Finds the restricted Damerau-Levenshtein distance between two strings. Characters that have
16+
/// already been transposed may not be modified.
1417
///
1518
/// Returns `None` if the distance exceeds the limit.
1619
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
17-
let n = a.chars().count();
18-
let m = b.chars().count();
19-
let min_dist = if n < m { m - n } else { n - m };
20+
let mut a = &a.chars().collect::<Vec<_>>()[..];
21+
let mut b = &b.chars().collect::<Vec<_>>()[..];
2022

23+
// Ensure that `b` is the shorter string, minimizing memory use.
24+
if a.len() < b.len() {
25+
mem::swap(&mut a, &mut b);
26+
}
27+
28+
let min_dist = a.len() - b.len();
29+
// If we know the limit will be exceeded, we can return early.
2130
if min_dist > limit {
2231
return None;
2332
}
24-
if n == 0 || m == 0 {
25-
return (min_dist <= limit).then_some(min_dist);
26-
}
2733

28-
let mut dcol: Vec<_> = (0..=m).collect();
34+
// Strip common prefix.
35+
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first())
36+
&& a_char == b_char
37+
{
38+
a = a_rest;
39+
b = b_rest;
40+
}
41+
// Strip common suffix.
42+
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last())
43+
&& a_char == b_char
44+
{
45+
a = a_rest;
46+
b = b_rest;
47+
}
2948

30-
for (i, sc) in a.chars().enumerate() {
31-
let mut current = i;
32-
dcol[0] = current + 1;
49+
// If either string is empty, the distance is the length of the other.
50+
// We know that `b` is the shorter string, so we don't need to check `a`.
51+
if b.len() == 0 {
52+
return Some(min_dist);
53+
}
3354

34-
for (j, tc) in b.chars().enumerate() {
35-
let next = dcol[j + 1];
36-
if sc == tc {
37-
dcol[j + 1] = current;
38-
} else {
39-
dcol[j + 1] = cmp::min(current, next);
40-
dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
55+
let mut prev_prev = vec![usize::MAX; b.len() + 1];
56+
let mut prev = (0..=b.len()).collect::<Vec<_>>();
57+
let mut current = vec![0; b.len() + 1];
58+
59+
// row by row
60+
for i in 1..=a.len() {
61+
current[0] = i;
62+
let a_idx = i - 1;
63+
64+
// column by column
65+
for j in 1..=b.len() {
66+
let b_idx = j - 1;
67+
68+
// There is no cost to substitute a character with itself.
69+
let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
70+
71+
current[j] = cmp::min(
72+
// deletion
73+
prev[j] + 1,
74+
cmp::min(
75+
// insertion
76+
current[j - 1] + 1,
77+
// substitution
78+
prev[j - 1] + substitution_cost,
79+
),
80+
);
81+
82+
if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
83+
// transposition
84+
current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
4185
}
42-
current = next;
4386
}
87+
88+
// Rotate the buffers, reusing the memory.
89+
[prev_prev, prev, current] = [prev, current, prev_prev];
4490
}
4591

46-
(dcol[m] <= limit).then_some(dcol[m])
92+
// The final row is in `prev`, not `current`, because the buffers were rotated at the end of the last iteration.
93+
let distance = prev[b.len()];
94+
(distance <= limit).then_some(distance)
4795
}
4896

4997
/// Provides a word similarity score between two words that accounts for substrings being more

compiler/rustc_span/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#![feature(negative_impls)]
2020
#![feature(min_specialization)]
2121
#![feature(rustc_attrs)]
22+
#![feature(let_chains)]
2223
#![deny(rustc::untranslatable_diagnostic)]
2324
#![deny(rustc::diagnostic_outside_of_impl)]
2425

tests/ui/check-cfg/invalid-cfg-value.stderr

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ warning: unexpected `cfg` condition value
22
--> $DIR/invalid-cfg-value.rs:7:7
33
|
44
LL | #[cfg(feature = "sedre")]
5-
| ^^^^^^^^^^^^^^^^^
5+
| ^^^^^^^^^^-------
6+
| |
7+
| help: did you mean: `"serde"`
68
|
79
= note: expected values for `feature` are: full, serde
810
= note: `#[warn(unexpected_cfgs)]` on by default

0 commit comments

Comments
 (0)