Skip to content

Commit 32974d5

Browse files
committed
Fix #75: map char ranges for case-insensitivity at parse time.
1 parent 399758a commit 32974d5

File tree

4 files changed

+20
-30
lines changed

4 files changed

+20
-30
lines changed

regex_macros/src/lib.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
494494
} else {
495495
quote_expr!(self.cx, found)
496496
};
497-
let mranges = self.match_class(casei, &ranges);
497+
let mranges = self.match_class(&ranges);
498498
quote_expr!(self.cx, {
499499
if self.chars.prev.is_some() {
500500
let c = $get_char;
@@ -529,12 +529,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
529529
// Translates a character class into a match expression.
530530
// This avoids a binary search (and is hopefully replaced by a jump
531531
// table).
532-
fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
533-
let mut arms = ranges.iter().map(|&(mut start, mut end)| {
534-
if casei {
535-
start = start.to_uppercase().next().unwrap();
536-
end = end.to_uppercase().next().unwrap();
537-
}
532+
fn match_class(&self, ranges: &[(char, char)]) -> P<ast::Expr> {
533+
let mut arms = ranges.iter().map(|&(start, end)| {
538534
let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
539535
quote_expr!(self.cx, $end)));
540536
self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))

regex_macros/tests/tests.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,10 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
355355
mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
356356
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
357357

358+
// Regression test for https://github.com/rust-lang/regex/issues/75
359+
mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2)));
360+
mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2)));
361+
358362
// A whole mess of tests from Glenn Fowler's regex test suite.
359363
// Generated by the 'src/etc/regex-match-tests' program.
360364
#[path = "matches.rs"]

src/parse.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,13 @@ impl Parser {
421421
}
422422
}
423423
']' if ranges.len() > 0 => {
424+
if self.flags & FLAG_NOCASE > 0 {
425+
// FIXME(https://github.com/rust-lang/regex/issues/76): This is wrong. Uppercasing only the endpoints of each range is too simplistic for correct Unicode support.
426+
for range in &mut ranges {
427+
range.0 = range.0.to_uppercase().next().unwrap();
428+
range.1 = range.1.to_uppercase().next().unwrap();
429+
}
430+
}
424431
ranges = combine_ranges(ranges);
425432
if negated {
426433
ranges = invert_ranges(ranges);

src/vm.rs

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,12 @@ impl<'r, 't> Nfa<'r, 't> {
229229
}
230230
}
231231
CharClass(ref ranges, flags) => {
232-
if let Some(c) = self.chars.prev {
232+
if let Some(mut c) = self.chars.prev {
233233
let negate = flags & FLAG_NEGATED > 0;
234-
let casei = flags & FLAG_NOCASE > 0;
235-
let found =
236-
ranges.binary_search_by(|&rc| class_cmp(casei, c, rc))
237-
.is_ok();
234+
if flags & FLAG_NOCASE > 0 {
235+
c = c.to_uppercase().next().unwrap();
236+
}
237+
let found = ranges.binary_search_by(|&rc| class_cmp(c, rc)).is_ok();
238238
if found ^ negate {
239239
self.add(nlist, pc+1, caps);
240240
}
@@ -540,26 +540,9 @@ pub fn is_word(c: Option<char>) -> bool {
540540
/// indicating whether the character is less than the start of the range,
541541
/// in the range (inclusive) or greater than the end of the range.
542542
///
543-
/// If `casei` is `true`, then this ordering is computed case insensitively.
544-
///
545543
/// This function is meant to be used with a binary search.
546544
#[inline]
547-
fn class_cmp(casei: bool, mut textc: char,
548-
(mut start, mut end): (char, char)) -> Ordering {
549-
if casei {
550-
// FIXME: This is pretty ridiculous. All of this case conversion
551-
// can be moved outside this function:
552-
// 1) textc should be uppercased outside the bsearch.
553-
// 2) the character class itself should be uppercased either in the
554-
// parser or the compiler.
555-
// FIXME: This is too simplistic for correct Unicode support.
556-
// See also: char_eq
557-
// FIXME: Standard library now yields iterators, so we should take
558-
// advantage of them.
559-
textc = textc.to_uppercase().next().unwrap();
560-
start = start.to_uppercase().next().unwrap();
561-
end = end.to_uppercase().next().unwrap();
562-
}
545+
fn class_cmp(textc: char, (start, end): (char, char)) -> Ordering {
563546
if textc >= start && textc <= end {
564547
Ordering::Equal
565548
} else if start > textc {

0 commit comments

Comments
 (0)