|
1 |
| -//! Levenshtein distances. |
| 1 | +//! Damerau-Levenshtein distances. |
2 | 2 | //!
|
3 |
| -//! The [Levenshtein distance] is a metric for measuring the difference between two strings. |
| 3 | +//! The [Damerau-Levenshtein distance] is a metric for measuring the difference between two strings. |
| 4 | +//! This implementation is a restricted version of the algorithm, as it does not permit modifying |
| 5 | +//! characters that have already been transposed. |
4 | 6 | //!
|
5 |
| -//! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance |
| 7 | +//! [Damerau-Levenshtein distance]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance |
6 | 8 |
|
7 | 9 | use crate::symbol::Symbol;
|
8 |
| -use std::cmp; |
| 10 | +use std::{cmp, mem}; |
9 | 11 |
|
10 | 12 | #[cfg(test)]
|
11 | 13 | mod tests;
|
12 | 14 |
|
/// Finds the restricted Damerau-Levenshtein distance between two strings. Characters that have
/// already been transposed may not be modified.
///
/// The distance is the minimum number of single-`char` insertions, deletions, substitutions and
/// transpositions of two adjacent `char`s needed to turn one string into the other. Both inputs
/// are compared `char` by `char` (not byte by byte).
///
/// Returns None if the distance exceeds the limit.
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
    let a_chars = a.chars().collect::<Vec<_>>();
    let b_chars = b.chars().collect::<Vec<_>>();
    let mut a = &a_chars[..];
    let mut b = &b_chars[..];

    // Ensure that `b` is the shorter string, minimizing memory use.
    if a.len() < b.len() {
        mem::swap(&mut a, &mut b);
    }

    // The length difference is a lower bound on the distance: at least that many
    // insertions/deletions are required.
    let min_dist = a.len() - b.len();
    // If we know the limit will be exceeded, we can return early.
    if min_dist > limit {
        return None;
    }

    // Strip common prefix. Matching leading characters never contribute to the distance.
    let prefix_len = a.iter().zip(b).take_while(|(x, y)| x == y).count();
    a = &a[prefix_len..];
    b = &b[prefix_len..];

    // Strip common suffix. This runs after the prefix was removed, so the two stripped regions
    // can never overlap.
    let suffix_len = a.iter().rev().zip(b.iter().rev()).take_while(|(x, y)| x == y).count();
    a = &a[..a.len() - suffix_len];
    b = &b[..b.len() - suffix_len];

    // If either string is empty, the distance is the length of the other.
    // We know that `b` is the shorter string, so we don't need to check `a`.
    // Stripping removed the same number of characters from both strings, so `min_dist` is still
    // `a.len() - b.len()`, which here equals `a.len()`.
    if b.is_empty() {
        return Some(min_dist);
    }

    // Only three rows of the dynamic-programming table are alive at any time:
    // `prev_prev` (row i - 2, needed for transpositions), `prev` (row i - 1) and `current` (row i).
    // `prev_prev` starts out as a placeholder; it is never read before the first rotation because
    // the transposition case requires `i > 1`.
    let width = b.len() + 1;
    let mut prev_prev = vec![usize::MAX; width];
    let mut prev = (0..width).collect::<Vec<_>>();
    let mut current = vec![0; width];

    // row by row
    for i in 1..=a.len() {
        current[0] = i;
        let a_char = a[i - 1];

        // column by column
        for j in 1..=b.len() {
            let b_char = b[j - 1];

            // There is no cost to substitute a character with itself.
            let substitution_cost = usize::from(a_char != b_char);

            let deletion = prev[j] + 1;
            let insertion = current[j - 1] + 1;
            let substitution = prev[j - 1] + substitution_cost;
            let mut best = cmp::min(deletion, cmp::min(insertion, substitution));

            // transposition: the last two characters of both prefixes are swapped.
            if i > 1 && j > 1 && a_char == b[j - 2] && a[i - 2] == b_char {
                best = cmp::min(best, prev_prev[j - 2] + 1);
            }

            current[j] = best;
        }

        // Rotate the buffers, reusing the memory.
        [prev_prev, prev, current] = [prev, current, prev_prev];
    }

    // `prev` because we already rotated the buffers.
    let distance = prev[b.len()];
    (distance <= limit).then_some(distance)
}
|
48 | 96 |
|
49 | 97 | /// Provides a word similarity score between two words that accounts for substrings being more
|
|
0 commit comments