Skip to content

Map char ranges for case-insensitivity at parse time. #77

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions regex_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
} else {
quote_expr!(self.cx, found)
};
let mranges = self.match_class(casei, &ranges);
let mranges = self.match_class(&ranges);
quote_expr!(self.cx, {
if self.chars.prev.is_some() {
let c = $get_char;
Expand Down Expand Up @@ -529,12 +529,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
// Translates a character class into a match expression.
// This avoids a binary search (and is hopefully replaced by a jump
// table).
fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
let mut arms = ranges.iter().map(|&(mut start, mut end)| {
if casei {
start = start.to_uppercase().next().unwrap();
end = end.to_uppercase().next().unwrap();
}
fn match_class(&self, ranges: &[(char, char)]) -> P<ast::Expr> {
let mut arms = ranges.iter().map(|&(start, end)| {
let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
quote_expr!(self.cx, $end)));
self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))
Expand Down
4 changes: 4 additions & 0 deletions regex_macros/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,10 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));

// Regression test for https://github.com/rust-lang/regex/issues/75
mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2)));
mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2)));

// A whole mess of tests from Glenn Fowler's regex test suite.
// Generated by the 'src/etc/regex-match-tests' program.
#[path = "matches.rs"]
Expand Down
7 changes: 7 additions & 0 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,13 @@ impl Parser {
}
}
']' if ranges.len() > 0 => {
if self.flags & FLAG_NOCASE > 0 {
// FIXME(https://github.com/rust-lang/regex/issues/76): This is wrong: only the endpoints of each range are uppercased.
for range in &mut ranges {
range.0 = range.0.to_uppercase().next().unwrap();
range.1 = range.1.to_uppercase().next().unwrap();
}
}
ranges = combine_ranges(ranges);
if negated {
ranges = invert_ranges(ranges);
Expand Down
29 changes: 6 additions & 23 deletions src/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,12 @@ impl<'r, 't> Nfa<'r, 't> {
}
}
CharClass(ref ranges, flags) => {
if let Some(c) = self.chars.prev {
if let Some(mut c) = self.chars.prev {
let negate = flags & FLAG_NEGATED > 0;
let casei = flags & FLAG_NOCASE > 0;
let found =
ranges.binary_search_by(|&rc| class_cmp(casei, c, rc))
.is_ok();
if flags & FLAG_NOCASE > 0 {
c = c.to_uppercase().next().unwrap();
}
let found = ranges.binary_search_by(|&rc| class_cmp(c, rc)).is_ok();
if found ^ negate {
self.add(nlist, pc+1, caps);
}
Expand Down Expand Up @@ -540,26 +540,9 @@ pub fn is_word(c: Option<char>) -> bool {
/// indicating whether the character is less than the start of the range,
/// in the range (inclusive) or greater than the end of the range.
///
/// If `casei` is `true`, then this ordering is computed case insensitively.
///
/// This function is meant to be used with a binary search.
#[inline]
fn class_cmp(casei: bool, mut textc: char,
(mut start, mut end): (char, char)) -> Ordering {
if casei {
// FIXME: This is pretty ridiculous. All of this case conversion
// can be moved outside this function:
// 1) textc should be uppercased outside the bsearch.
// 2) the character class itself should be uppercased either in the
// parser or the compiler.
// FIXME: This is too simplistic for correct Unicode support.
// See also: char_eq
// FIXME: Standard library now yields iterators, so we should take
// advantage of them.
textc = textc.to_uppercase().next().unwrap();
start = start.to_uppercase().next().unwrap();
end = end.to_uppercase().next().unwrap();
}
fn class_cmp(textc: char, (start, end): (char, char)) -> Ordering {
if textc >= start && textc <= end {
Ordering::Equal
} else if start > textc {
Expand Down