Fixes #169.

BurntSushi · BurntSushi · commit 55a1fc9687d4 · 2016-02-18T19:38:49.000-05:00
This fixes an issue where prefix literals could contain alternates such
that an Aho-Corasick match could be ambiguous. More specifically, AC
returns matches in the order in which they appear in the text rather than
specifically "leftmost first." This means that if an alternate is a
substring of another alternate, then AC will return a match for that first
even if the other alternate is left of it. In essence, this causes a prefix
scan to skip too far ahead in the text.

We fix this by ensuring that prefix literals are unambiguous by truncating
all literals such that none of them are substrings of another. This is
written in a way that is somewhat expensive, but we constrain the number
of literals greatly, so the quadratic runtime shouldn't be too noticeable.
diff --git a/src/literals.rs b/src/literals.rs
@@ -19,6 +19,7 @@ use memchr::{memchr, memchr2, memchr3};
 use char_utf8::encode_utf8;
 use inst::{Insts, Inst, InstBytes, InstRanges};
 
+#[derive(Clone, Eq, PartialEq)]
 pub struct AlternateLiterals {
     at_match: bool,
     literals: Vec<Vec<u8>>,
@@ -29,10 +30,11 @@ impl AlternateLiterals {
         if self.literals.is_empty() {
             Literals::empty()
         } else {
-            let at_match = self.at_match;
+            let alts = self.unambiguous();
+            let at_match = alts.at_match;
             Literals {
                 at_match: at_match,
-                matcher: LiteralMatcher::new(self),
+                matcher: LiteralMatcher::new(alts),
             }
         }
     }
@@ -53,6 +55,13 @@ impl AlternateLiterals {
         self.literals.len() >= 1 && self.literals.iter().all(|s| s.len() == 1)
     }
 
+    fn all_same_length(&self) -> bool {
+        if self.literals.is_empty() {
+            return true;
+        }
+        self.literals.iter().all(|s| s.len() == self.literals[0].len())
+    }
+
     fn is_one_literal(&self) -> bool {
         self.literals.len() == 1
     }
@@ -123,6 +132,71 @@ impl AlternateLiterals {
             }
         }
     }
+
+    /// Returns a new set of alternate literals that are guaranteed to be
+    /// unambiguous.
+    ///
+    /// State differently, the set of literals returned is guaranteed to never
+    /// result in any overlapping matches.
+    ///
+    /// Duplicate literals are dropped. Literals that are otherwise distinct
+    /// will be possibly truncated.
+    fn unambiguous(&self) -> AlternateLiterals {
+        fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
+            let mut i = 0;
+            while haystack.len() >= needle.len() {
+                if needle == &haystack[..needle.len()] {
+                    return Some(i);
+                }
+                i += 1;
+                haystack = &haystack[1..];
+            }
+            None
+        }
+
+        // This function is a bit gratuitous and allocation heavy, but in
+        // general, we limit the number of alternates to be pretty small.
+
+        if self.all_same_length() {
+            return self.clone();
+        }
+        let mut new = AlternateLiterals {
+            at_match: self.at_match,
+            literals: Vec::with_capacity(self.literals.len()),
+        };
+'OUTER:
+        for lit1 in &self.literals {
+            if new.literals.is_empty() {
+                new.literals.push(lit1.clone());
+                continue;
+            }
+            let mut candidate = lit1.clone();
+            for lit2 in &mut new.literals {
+                if &candidate == lit2 {
+                    // If the literal is already in the set, then we can
+                    // just drop it.
+                    continue 'OUTER;
+                }
+                if lit1.len() <= lit2.len() {
+                    if let Some(i) = position(&candidate, lit2) {
+                        new.at_match = false;
+                        lit2.truncate(i);
+                    }
+                } else {
+                    if let Some(i) = position(lit2, &candidate) {
+                        new.at_match = false;
+                        candidate.truncate(i);
+                    }
+                }
+            }
+            new.literals.push(candidate);
+        }
+        new.literals.retain(|lit| !lit.is_empty());
+        // This is OK only because the alternates are unambiguous.
+        new.literals.sort();
+        new.literals.dedup();
+        new
+    }
 }
 
 pub struct BuildPrefixes<'a> {
@@ -324,20 +398,59 @@ enum LiteralMatcher {
     /// A single byte prefix.
     Byte(u8),
     /// A set of two or more single byte prefixes.
-    /// This could be reduced to a bitset, which would use only 8 bytes,
-    /// but I don't think we care.
     Bytes {
         chars: Vec<u8>,
         sparse: Vec<bool>,
     },
+    /// A single substring. (Likely using Boyer-Moore with memchr.)
     Single(SingleSearch),
-    /// A full Aho-Corasick DFA. A "full" DFA in this case means that all of
-    /// the failure transitions have been expanded and the entire DFA is
-    /// represented by a memory inefficient sparse matrix. This makes matching
-    /// extremely fast. We only use this "full" DFA when the number of bytes
-    /// in our literals does not exceed 250. This generally leads to a DFA that
-    /// consumes 250KB of memory.
-    FullAutomaton(FullAcAutomaton<Vec<u8>>),
+    /// An Aho-Corasick automaton.
+    AC(FullAcAutomaton<Vec<u8>>),
+}
+
+impl LiteralMatcher {
+    /// Create a new prefix matching machine.
+    fn new(mut alts: AlternateLiterals) -> Self {
+        use self::LiteralMatcher::*;
+
+        if alts.is_empty() {
+            Empty
+        } else if alts.distinct_single_bytes() >= 26 {
+            // Why do we do this? Well, it's a heuristic to prevent thrashing.
+            // Basically, if our literal matcher has lots of literals that are
+            // a single byte, then we lose a lot of the benefits of fast
+            // literal searching. In particular, single bytes have a high
+            // probability of matching. In a regex that rarely matches, we end
+            // up ping-ponging between the literal matcher and the regex engine
+            // for every byte of input. That's bad juju.
+            //
+            // Note that we only count distinct starting bytes from literals of
+            // length 1. For literals longer than that, we assume they have
+            // a lower probability of matching.
+            //
+            // This particular heuristic would be triggered on, e.g.,
+            // `[a-z].+`. The prefix here is a single byte that is very likely
+            // to match on any given byte in the input, so it's quicker just
+            // to let the matching engine process it.
+            //
+            // TODO(burntsushi): Consider lowering the threshold!
+            Empty
+        } else if alts.is_single_byte() {
+            Byte(alts.literals[0][0])
+        } else if alts.all_single_bytes() {
+            let mut set = vec![false; 256];
+            let mut bytes = vec![];
+            for lit in alts.literals {
+                bytes.push(lit[0]);
+                set[lit[0] as usize] = true;
+            }
+            Bytes { chars: bytes, sparse: set }
+        } else if alts.is_one_literal() {
+            Single(SingleSearch::new(alts.literals.pop().unwrap()))
+        } else {
+            AC(AcAutomaton::new(alts.literals).into_full())
+        }
+    }
 }
 
 impl Literals {
@@ -377,7 +490,7 @@ impl Literals {
             Single(ref searcher) => {
                 searcher.find(haystack).map(|i| (i, i + searcher.pat.len()))
             }
-            FullAutomaton(ref aut) => {
+            AC(ref aut) => {
                 aut.find(haystack).next().map(|m| (m.start, m.end))
             }
         }
@@ -396,34 +509,7 @@ impl Literals {
             Byte(_) => 1,
             Bytes { ref chars, .. } => chars.len(),
             Single(_) => 1,
-            FullAutomaton(ref aut) => aut.len(),
-        }
-    }
-
-    /// Returns true iff the prefix match preserves priority.
-    ///
-    /// For example, given the alternation `ab|a` and the target string `ab`,
-    /// does the prefix machine guarantee that `ab` will match? (A full
-    /// Aho-Corasick automaton does not!)
-    pub fn preserves_priority(&self) -> bool {
-        use self::LiteralMatcher::*;
-        match self.matcher {
-            Empty => true,
-            Byte(_) => true,
-            Bytes{..} => true,
-            Single(_) => true,
-            FullAutomaton(ref aut) => {
-                // Okay, so the automaton can respect priority in one
-                // particular case: when every pattern is of the same length.
-                // The trick is that the automaton will report the leftmost
-                // match, which in this case, corresponds to the correct
-                // match for the regex engine. If any other alternate matches
-                // at the same position, then they must be exactly equivalent.
-
-                // Guaranteed at least one prefix by construction, so use
-                // that for the length.
-                aut.patterns().iter().all(|p| p.len() == aut.pattern(0).len())
-            }
+            AC(ref aut) => aut.len(),
         }
     }
 
@@ -440,7 +526,7 @@ impl Literals {
                 (single.pat.len() * mem::size_of::<u8>())
                 + (single.shift.len() * mem::size_of::<usize>())
             }
-            FullAutomaton(ref aut) => aut.heap_bytes(),
+            AC(ref aut) => aut.heap_bytes(),
         }
     }
 
@@ -465,52 +551,7 @@ impl Literals {
                 chars.iter().map(|&byte| vec![byte]).collect()
             }
             Single(ref searcher) => vec![searcher.pat.clone()],
-            FullAutomaton(ref aut) => aut.patterns().iter().cloned().collect(),
-        }
-    }
-}
-
-impl LiteralMatcher {
-    /// Create a new prefix matching machine.
-    fn new(mut alts: AlternateLiterals) -> Self {
-        use self::LiteralMatcher::*;
-
-        if alts.is_empty() {
-            Empty
-        } else if alts.distinct_single_bytes() >= 26 {
-            // Why do we do this? Well, it's a heuristic to prevent thrashing.
-            // Basically, if our literal matcher has lots of literals that are
-            // a single byte, then we lose a lot of the benefits of fast
-            // literal searching. In particular, single bytes have a high
-            // probability of matching. In a regex that rarely matches, we end
-            // up ping-ponging between the literal matcher and the regex engine
-            // for every byte of input. That's bad juju.
-            //
-            // Note that we only count distinct starting bytes from literals of
-            // length 1. For literals longer than that, we assume they have
-            // a lower probability of matching.
-            //
-            // This particular heuristic would be triggered on, e.g.,
-            // `[a-z].+`. The prefix here is a single byte that is very likely
-            // to match on any given byte in the input, so it's quicker just
-            // to let the matching engine process it.
-            //
-            // TODO(burntsushi): Consider lowering the threshold!
-            Empty
-        } else if alts.is_single_byte() {
-            Byte(alts.literals[0][0])
-        } else if alts.all_single_bytes() {
-            let mut set = vec![false; 256];
-            let mut bytes = vec![];
-            for lit in alts.literals {
-                bytes.push(lit[0]);
-                set[lit[0] as usize] = true;
-            }
-            Bytes { chars: bytes, sparse: set }
-        } else if alts.is_one_literal() {
-            Single(SingleSearch::new(alts.literals.pop().unwrap()))
-        } else {
-            FullAutomaton(AcAutomaton::new(alts.literals).into_full())
+            AC(ref aut) => aut.patterns().iter().cloned().collect(),
         }
     }
 }
@@ -586,6 +627,19 @@ fn find_singles(
     None
 }
 
+impl fmt::Debug for AlternateLiterals {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut strings = vec![];
+        for lit in &self.literals {
+            strings.push(String::from_utf8_lossy(lit).into_owned());
+        }
+        f.debug_struct("AlternateLiterals")
+         .field("at_match", &self.at_match)
+         .field("literals", &strings)
+         .finish()
+    }
+}
+
 impl fmt::Debug for Literals {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         use self::LiteralMatcher::*;
@@ -601,14 +655,15 @@ impl fmt::Debug for Literals {
                 write!(f, "alternate single bytes: {}", chars.join(", "))
             }
             Single(ref searcher) => write!(f, "{:?}", searcher),
-            FullAutomaton(ref aut) => write!(f, "{:?}", aut),
+            AC(ref aut) => write!(f, "{:?}", aut),
         }
     }
 }
 
 #[cfg(test)]
 mod tests {
     use program::ProgramBuilder;
+    use super::AlternateLiterals;
 
     macro_rules! prog {
         ($re:expr) => { ProgramBuilder::new($re).compile().unwrap() }
@@ -702,6 +757,46 @@ mod tests {
         assert_eq!(prefixes_complete!("☃"), vec!["☃"]);
     }
 
+    macro_rules! alts {
+        ($($s:expr),*) => {{
+            AlternateLiterals {
+                at_match: false,
+                literals: vec![$($s.as_bytes().to_owned()),*],
+            }
+        }}
+    }
+
+    #[test]
+    fn unambiguous() {
+        let given = alts!("z", "azb");
+        let expected = alts!("a", "z");
+        assert_eq!(expected, given.unambiguous());
+
+        let given = alts!("zaaaaaaaaaa", "aa");
+        let expected = alts!("aa", "z");
+        assert_eq!(expected, given.unambiguous());
+
+        let given = alts!("Sherlock", "Watson");
+        let expected = alts!("Sherlock", "Watson");
+        assert_eq!(expected, given.unambiguous());
+
+        let given = alts!("abc", "bc");
+        let expected = alts!("a", "bc");
+        assert_eq!(expected, given.unambiguous());
+
+        let given = alts!("bc", "abc");
+        let expected = alts!("a", "bc");
+        assert_eq!(expected, given.unambiguous());
+
+        let given = alts!("a", "aa");
+        let expected = alts!("a");
+        assert_eq!(expected, given.unambiguous());
+
+        let given = alts!("ab", "a");
+        let expected = alts!("a");
+        assert_eq!(expected, given.unambiguous());
+    }
+
     // That this test fails suggests that the literal finder needs to be
     // completely rewritten. Ug. It's not that it is wrong currently, but
     // it's not as good at finding literals as it should be.
diff --git a/src/program.rs b/src/program.rs
@@ -134,7 +134,7 @@ impl Program {
     /// If this returns true, then it is possible to avoid running any of the
     /// NFA or DFA based matching engines entirely.
     pub fn is_prefix_match(&self) -> bool {
-        self.prefixes.at_match() && self.prefixes.preserves_priority()
+        self.prefixes.at_match()
     }
 
     /// Returns true if the underlying program is reversed.
diff --git a/tests/tests.rs b/tests/tests.rs
@@ -500,6 +500,9 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
 
 mat!(one_unicode, r"☃", "☃", Some((0, 3)));
 
+// Regression test for https://github.com/rust-lang-nursery/regex/issues/169
+mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
+
 // A whole mess of tests from Glenn Fowler's regex test suite.
 // Generated by the 'src/etc/regex-match-tests' program.
 #[path = "matches.rs"]

Original file line number	Diff line number	Diff line change
`@@ -134,7 +134,7 @@ impl Program {`
`134`	`134`	`/// If this returns true, then it is possible to avoid running any of the`
`135`	`135`	`/// NFA or DFA based matching engines entirely.`
`136`	`136`	`pub fn is_prefix_match(&self) -> bool {`
`137`		`- self.prefixes.at_match() && self.prefixes.preserves_priority()`
	`137`	`+ self.prefixes.at_match()`
`138`	`138`	`}`
`139`	`139`
`140`	`140`	`/// Returns true if the underlying program is reversed.`