From 8d764eacb19eacf721b2c6340e451025f3f15e56 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 28 Dec 2016 23:42:42 -0500 Subject: [PATCH] Compute word boundary flags in start state. At some point, I think I had convinced myself that we didn't need to compute word boundary flags for the initial state, but it turns out that we do. Fixes #271 --- src/dfa.rs | 25 ++++++++++++++++++++++--- tests/regression.rs | 7 +++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/dfa.rs b/src/dfa.rs index c453d63d72..d3c007c350 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1378,7 +1378,9 @@ impl<'a> Fsm<'a> { ((empty_flags.end as u8) << 1) | ((empty_flags.start_line as u8) << 2) | ((empty_flags.end_line as u8) << 3) | - ((state_flags.is_word() as u8) << 4)) + ((empty_flags.word_boundary as u8) << 4) | + ((empty_flags.not_word_boundary as u8) << 5) | + ((state_flags.is_word() as u8) << 6)) as usize }; match self.cache.start_states[flagi] { @@ -1412,9 +1414,17 @@ impl<'a> Fsm<'a> { empty_flags.end = text.len() == 0; empty_flags.start_line = at == 0 || text[at - 1] == b'\n'; empty_flags.end_line = text.len() == 0; - if at > 0 && Byte::byte(text[at - 1]).is_ascii_word() { + + let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word(); + if is_word_last { state_flags.set_word(); } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } (empty_flags, state_flags) } @@ -1433,9 +1443,18 @@ impl<'a> Fsm<'a> { empty_flags.end = text.len() == 0; empty_flags.start_line = at == text.len() || text[at] == b'\n'; empty_flags.end_line = text.len() == 0; - if at < text.len() && Byte::byte(text[at]).is_ascii_word() { + + let is_word_last = + at < text.len() && Byte::byte(text[at]).is_ascii_word(); + let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + if is_word_last { state_flags.set_word(); } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } (empty_flags, state_flags) } diff --git a/tests/regression.rs b/tests/regression.rs index d2732467cf..05717ea1bc 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -75,3 +75,10 @@ mat!(lits_unambiguous1, u!(r"(ABC|CDA|BC)X"), "CDAX", Some((0, 4))); // See: https://github.com/rust-lang-nursery/regex/issues/291 mat!(lits_unambiguous2, u!(r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P[0-9]+)$"), "CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8))); + +// See: https://github.com/rust-lang-nursery/regex/issues/271 +mat!(end_not_wb, u!(r"$(?-u:\B)"), "\u{5c124}\u{b576c}", Some((8, 8))); +mat!(endl_or_wb, u!(r"(?m:$)|(?-u:\b)"), "\u{6084e}", Some((4, 4))); +mat!(zero_or_end, u!(r"(?i-u:\x00)|$"), "\u{e682f}", Some((4, 4))); +mat!(y_or_endl, u!(r"(?i-u:y)|(?m:$)"), "\u{b4331}", Some((4, 4))); +mat!(wb_start_x, u!(r"(?u:\b)^(?-u:X)"), "X", Some((0, 1)));