Skip to content

Commit 55a1fc9

Browse files
committed
Fixes #169.
This fixes an issue where prefix literals could contain alternates such that an Aho-Corasick match could be ambiguous. More specifically, AC returns matches in the order in which they appear in the text rather than specifically "leftmost first." This means that if an alternate is a substring of another alternate, then AC will return a match for that first even if the other alternate is left of it. In essence, this causes a prefix scan to skip too far ahead in the text. We fix this by ensuring that prefix literals are unambiguous by truncating all literals such that none of them are substrings of another. This is written in a way that is somewhat expensive, but we constrain the number of literals greatly, so the quadratic runtime shouldn't be too noticeable.
1 parent 277926f commit 55a1fc9

File tree

3 files changed

+187
-89
lines changed

3 files changed

+187
-89
lines changed

src/literals.rs

Lines changed: 183 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use memchr::{memchr, memchr2, memchr3};
1919
use char_utf8::encode_utf8;
2020
use inst::{Insts, Inst, InstBytes, InstRanges};
2121

22+
#[derive(Clone, Eq, PartialEq)]
2223
pub struct AlternateLiterals {
2324
at_match: bool,
2425
literals: Vec<Vec<u8>>,
@@ -29,10 +30,11 @@ impl AlternateLiterals {
2930
if self.literals.is_empty() {
3031
Literals::empty()
3132
} else {
32-
let at_match = self.at_match;
33+
let alts = self.unambiguous();
34+
let at_match = alts.at_match;
3335
Literals {
3436
at_match: at_match,
35-
matcher: LiteralMatcher::new(self),
37+
matcher: LiteralMatcher::new(alts),
3638
}
3739
}
3840
}
@@ -53,6 +55,13 @@ impl AlternateLiterals {
5355
self.literals.len() >= 1 && self.literals.iter().all(|s| s.len() == 1)
5456
}
5557

58+
fn all_same_length(&self) -> bool {
59+
if self.literals.is_empty() {
60+
return true;
61+
}
62+
self.literals.iter().all(|s| s.len() == self.literals[0].len())
63+
}
64+
5665
fn is_one_literal(&self) -> bool {
5766
self.literals.len() == 1
5867
}
@@ -123,6 +132,71 @@ impl AlternateLiterals {
123132
}
124133
}
125134
}
135+
136+
/// Returns a new set of alternate literals that are guaranteed to be
137+
/// unambiguous.
138+
///
139+
/// State differently, the set of literals returned is guaranteed to never
140+
/// result in any overlapping matches.
141+
///
142+
/// Duplicate literals are dropped. Literals that are otherwise distinct
143+
/// will be possibly truncated.
144+
fn unambiguous(&self) -> AlternateLiterals {
145+
fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
146+
let mut i = 0;
147+
while haystack.len() >= needle.len() {
148+
if needle == &haystack[..needle.len()] {
149+
return Some(i);
150+
}
151+
i += 1;
152+
haystack = &haystack[1..];
153+
}
154+
None
155+
}
156+
157+
// This function is a bit gratuitous and allocation heavy, but in
158+
// general, we limit the number of alternates to be pretty small.
159+
160+
if self.all_same_length() {
161+
return self.clone();
162+
}
163+
let mut new = AlternateLiterals {
164+
at_match: self.at_match,
165+
literals: Vec::with_capacity(self.literals.len()),
166+
};
167+
'OUTER:
168+
for lit1 in &self.literals {
169+
if new.literals.is_empty() {
170+
new.literals.push(lit1.clone());
171+
continue;
172+
}
173+
let mut candidate = lit1.clone();
174+
for lit2 in &mut new.literals {
175+
if &candidate == lit2 {
176+
// If the literal is already in the set, then we can
177+
// just drop it.
178+
continue 'OUTER;
179+
}
180+
if lit1.len() <= lit2.len() {
181+
if let Some(i) = position(&candidate, lit2) {
182+
new.at_match = false;
183+
lit2.truncate(i);
184+
}
185+
} else {
186+
if let Some(i) = position(lit2, &candidate) {
187+
new.at_match = false;
188+
candidate.truncate(i);
189+
}
190+
}
191+
}
192+
new.literals.push(candidate);
193+
}
194+
new.literals.retain(|lit| !lit.is_empty());
195+
// This is OK only because the alternates are unambiguous.
196+
new.literals.sort();
197+
new.literals.dedup();
198+
new
199+
}
126200
}
127201

128202
pub struct BuildPrefixes<'a> {
@@ -324,20 +398,59 @@ enum LiteralMatcher {
324398
/// A single byte prefix.
325399
Byte(u8),
326400
/// A set of two or more single byte prefixes.
327-
/// This could be reduced to a bitset, which would use only 8 bytes,
328-
/// but I don't think we care.
329401
Bytes {
330402
chars: Vec<u8>,
331403
sparse: Vec<bool>,
332404
},
405+
/// A single substring. (Likely using Boyer-Moore with memchr.)
333406
Single(SingleSearch),
334-
/// A full Aho-Corasick DFA. A "full" DFA in this case means that all of
335-
/// the failure transitions have been expanded and the entire DFA is
336-
/// represented by a memory inefficient sparse matrix. This makes matching
337-
/// extremely fast. We only use this "full" DFA when the number of bytes
338-
/// in our literals does not exceed 250. This generally leads to a DFA that
339-
/// consumes 250KB of memory.
340-
FullAutomaton(FullAcAutomaton<Vec<u8>>),
407+
/// An Aho-Corasick automaton.
408+
AC(FullAcAutomaton<Vec<u8>>),
409+
}
410+
411+
impl LiteralMatcher {
412+
/// Create a new prefix matching machine.
413+
fn new(mut alts: AlternateLiterals) -> Self {
414+
use self::LiteralMatcher::*;
415+
416+
if alts.is_empty() {
417+
Empty
418+
} else if alts.distinct_single_bytes() >= 26 {
419+
// Why do we do this? Well, it's a heuristic to prevent thrashing.
420+
// Basically, if our literal matcher has lots of literals that are
421+
// a single byte, then we lose a lot of the benefits of fast
422+
// literal searching. In particular, single bytes have a high
423+
// probability of matching. In a regex that rarely matches, we end
424+
// up ping-ponging between the literal matcher and the regex engine
425+
// for every byte of input. That's bad juju.
426+
//
427+
// Note that we only count distinct starting bytes from literals of
428+
// length 1. For literals longer than that, we assume they have
429+
// a lower probability of matching.
430+
//
431+
// This particular heuristic would be triggered on, e.g.,
432+
// `[a-z].+`. The prefix here is a single byte that is very likely
433+
// to match on any given byte in the input, so it's quicker just
434+
// to let the matching engine process it.
435+
//
436+
// TODO(burntsushi): Consider lowering the threshold!
437+
Empty
438+
} else if alts.is_single_byte() {
439+
Byte(alts.literals[0][0])
440+
} else if alts.all_single_bytes() {
441+
let mut set = vec![false; 256];
442+
let mut bytes = vec![];
443+
for lit in alts.literals {
444+
bytes.push(lit[0]);
445+
set[lit[0] as usize] = true;
446+
}
447+
Bytes { chars: bytes, sparse: set }
448+
} else if alts.is_one_literal() {
449+
Single(SingleSearch::new(alts.literals.pop().unwrap()))
450+
} else {
451+
AC(AcAutomaton::new(alts.literals).into_full())
452+
}
453+
}
341454
}
342455

343456
impl Literals {
@@ -377,7 +490,7 @@ impl Literals {
377490
Single(ref searcher) => {
378491
searcher.find(haystack).map(|i| (i, i + searcher.pat.len()))
379492
}
380-
FullAutomaton(ref aut) => {
493+
AC(ref aut) => {
381494
aut.find(haystack).next().map(|m| (m.start, m.end))
382495
}
383496
}
@@ -396,34 +509,7 @@ impl Literals {
396509
Byte(_) => 1,
397510
Bytes { ref chars, .. } => chars.len(),
398511
Single(_) => 1,
399-
FullAutomaton(ref aut) => aut.len(),
400-
}
401-
}
402-
403-
/// Returns true iff the prefix match preserves priority.
404-
///
405-
/// For example, given the alternation `ab|a` and the target string `ab`,
406-
/// does the prefix machine guarantee that `ab` will match? (A full
407-
/// Aho-Corasick automaton does not!)
408-
pub fn preserves_priority(&self) -> bool {
409-
use self::LiteralMatcher::*;
410-
match self.matcher {
411-
Empty => true,
412-
Byte(_) => true,
413-
Bytes{..} => true,
414-
Single(_) => true,
415-
FullAutomaton(ref aut) => {
416-
// Okay, so the automaton can respect priority in one
417-
// particular case: when every pattern is of the same length.
418-
// The trick is that the automaton will report the leftmost
419-
// match, which in this case, corresponds to the correct
420-
// match for the regex engine. If any other alternate matches
421-
// at the same position, then they must be exactly equivalent.
422-
423-
// Guaranteed at least one prefix by construction, so use
424-
// that for the length.
425-
aut.patterns().iter().all(|p| p.len() == aut.pattern(0).len())
426-
}
512+
AC(ref aut) => aut.len(),
427513
}
428514
}
429515

@@ -440,7 +526,7 @@ impl Literals {
440526
(single.pat.len() * mem::size_of::<u8>())
441527
+ (single.shift.len() * mem::size_of::<usize>())
442528
}
443-
FullAutomaton(ref aut) => aut.heap_bytes(),
529+
AC(ref aut) => aut.heap_bytes(),
444530
}
445531
}
446532

@@ -465,52 +551,7 @@ impl Literals {
465551
chars.iter().map(|&byte| vec![byte]).collect()
466552
}
467553
Single(ref searcher) => vec![searcher.pat.clone()],
468-
FullAutomaton(ref aut) => aut.patterns().iter().cloned().collect(),
469-
}
470-
}
471-
}
472-
473-
impl LiteralMatcher {
474-
/// Create a new prefix matching machine.
475-
fn new(mut alts: AlternateLiterals) -> Self {
476-
use self::LiteralMatcher::*;
477-
478-
if alts.is_empty() {
479-
Empty
480-
} else if alts.distinct_single_bytes() >= 26 {
481-
// Why do we do this? Well, it's a heuristic to prevent thrashing.
482-
// Basically, if our literal matcher has lots of literals that are
483-
// a single byte, then we lose a lot of the benefits of fast
484-
// literal searching. In particular, single bytes have a high
485-
// probability of matching. In a regex that rarely matches, we end
486-
// up ping-ponging between the literal matcher and the regex engine
487-
// for every byte of input. That's bad juju.
488-
//
489-
// Note that we only count distinct starting bytes from literals of
490-
// length 1. For literals longer than that, we assume they have
491-
// a lower probability of matching.
492-
//
493-
// This particular heuristic would be triggered on, e.g.,
494-
// `[a-z].+`. The prefix here is a single byte that is very likely
495-
// to match on any given byte in the input, so it's quicker just
496-
// to let the matching engine process it.
497-
//
498-
// TODO(burntsushi): Consider lowering the threshold!
499-
Empty
500-
} else if alts.is_single_byte() {
501-
Byte(alts.literals[0][0])
502-
} else if alts.all_single_bytes() {
503-
let mut set = vec![false; 256];
504-
let mut bytes = vec![];
505-
for lit in alts.literals {
506-
bytes.push(lit[0]);
507-
set[lit[0] as usize] = true;
508-
}
509-
Bytes { chars: bytes, sparse: set }
510-
} else if alts.is_one_literal() {
511-
Single(SingleSearch::new(alts.literals.pop().unwrap()))
512-
} else {
513-
FullAutomaton(AcAutomaton::new(alts.literals).into_full())
554+
AC(ref aut) => aut.patterns().iter().cloned().collect(),
514555
}
515556
}
516557
}
@@ -586,6 +627,19 @@ fn find_singles(
586627
None
587628
}
588629

630+
impl fmt::Debug for AlternateLiterals {
631+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
632+
let mut strings = vec![];
633+
for lit in &self.literals {
634+
strings.push(String::from_utf8_lossy(lit).into_owned());
635+
}
636+
f.debug_struct("AlternateLiterals")
637+
.field("at_match", &self.at_match)
638+
.field("literals", &strings)
639+
.finish()
640+
}
641+
}
642+
589643
impl fmt::Debug for Literals {
590644
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
591645
use self::LiteralMatcher::*;
@@ -601,14 +655,15 @@ impl fmt::Debug for Literals {
601655
write!(f, "alternate single bytes: {}", chars.join(", "))
602656
}
603657
Single(ref searcher) => write!(f, "{:?}", searcher),
604-
FullAutomaton(ref aut) => write!(f, "{:?}", aut),
658+
AC(ref aut) => write!(f, "{:?}", aut),
605659
}
606660
}
607661
}
608662

609663
#[cfg(test)]
610664
mod tests {
611665
use program::ProgramBuilder;
666+
use super::AlternateLiterals;
612667

613668
macro_rules! prog {
614669
($re:expr) => { ProgramBuilder::new($re).compile().unwrap() }
@@ -702,6 +757,46 @@ mod tests {
702757
assert_eq!(prefixes_complete!("☃"), vec!["☃"]);
703758
}
704759

760+
macro_rules! alts {
761+
($($s:expr),*) => {{
762+
AlternateLiterals {
763+
at_match: false,
764+
literals: vec![$($s.as_bytes().to_owned()),*],
765+
}
766+
}}
767+
}
768+
769+
#[test]
770+
fn unambiguous() {
771+
let given = alts!("z", "azb");
772+
let expected = alts!("a", "z");
773+
assert_eq!(expected, given.unambiguous());
774+
775+
let given = alts!("zaaaaaaaaaa", "aa");
776+
let expected = alts!("aa", "z");
777+
assert_eq!(expected, given.unambiguous());
778+
779+
let given = alts!("Sherlock", "Watson");
780+
let expected = alts!("Sherlock", "Watson");
781+
assert_eq!(expected, given.unambiguous());
782+
783+
let given = alts!("abc", "bc");
784+
let expected = alts!("a", "bc");
785+
assert_eq!(expected, given.unambiguous());
786+
787+
let given = alts!("bc", "abc");
788+
let expected = alts!("a", "bc");
789+
assert_eq!(expected, given.unambiguous());
790+
791+
let given = alts!("a", "aa");
792+
let expected = alts!("a");
793+
assert_eq!(expected, given.unambiguous());
794+
795+
let given = alts!("ab", "a");
796+
let expected = alts!("a");
797+
assert_eq!(expected, given.unambiguous());
798+
}
799+
705800
// That this test fails suggests that the literal finder needs to be
706801
// completely rewritten. Ug. It's not that it is wrong currently, but
707802
// it's not as good at finding literals as it should be.

src/program.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ impl Program {
134134
/// If this returns true, then it is possible to avoid running any of the
135135
/// NFA or DFA based matching engines entirely.
136136
pub fn is_prefix_match(&self) -> bool {
137-
self.prefixes.at_match() && self.prefixes.preserves_priority()
137+
self.prefixes.at_match()
138138
}
139139

140140
/// Returns true if the underlying program is reversed.

tests/tests.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,9 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
500500

501501
mat!(one_unicode, r"☃", "☃", Some((0, 3)));
502502

503+
// Regression test for https://github.com/rust-lang-nursery/regex/issues/169
504+
mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
505+
503506
// A whole mess of tests from Glenn Fowler's regex test suite.
504507
// Generated by the 'src/etc/regex-match-tests' program.
505508
#[path = "matches.rs"]

0 commit comments

Comments
 (0)