rust-lang · SimonSapin · Apr 19, 2015
diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs
@@ -494,7 +494,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         } else {
                             quote_expr!(self.cx, found)
                         };
-                    let mranges = self.match_class(casei, &ranges);
+                    let mranges = self.match_class(&ranges);
                     quote_expr!(self.cx, {
                         if self.chars.prev.is_some() {
                             let c = $get_char;
@@ -529,12 +529,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
     // Translates a character class into a match expression.
     // This avoids a binary search (and is hopefully replaced by a jump
     // table).
-    fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
-        let mut arms = ranges.iter().map(|&(mut start, mut end)| {
-            if casei {
-                start = start.to_uppercase().next().unwrap();
-                end = end.to_uppercase().next().unwrap();
-            }
+    fn match_class(&self, ranges: &[(char, char)]) -> P<ast::Expr> {
+        let mut arms = ranges.iter().map(|&(start, end)| {
             let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
                                                          quote_expr!(self.cx, $end)));
             self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))

diff --git a/regex_macros/tests/tests.rs b/regex_macros/tests/tests.rs
@@ -355,6 +355,10 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
 mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
 mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
 
+// Regression test for https://github.com/rust-lang/regex/issues/75
+mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2)));
+mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2)));
+
 // A whole mess of tests from Glenn Fowler's regex test suite.
 // Generated by the 'src/etc/regex-match-tests' program.
 #[path = "matches.rs"]

diff --git a/src/parse.rs b/src/parse.rs
@@ -421,6 +421,13 @@ impl Parser {
                     }
                 }
                 ']' if ranges.len() > 0 => {
+                    if self.flags & FLAG_NOCASE > 0 {
+                        // FIXME(https://github.com/rust-lang/regex/issues/76): This is wrong: uppercasing only the range endpoints is not proper case folding.
+                        for range in &mut ranges {
+                            range.0 = range.0.to_uppercase().next().unwrap();
+                            range.1 = range.1.to_uppercase().next().unwrap();
+                        }
+                    }
                     ranges = combine_ranges(ranges);
                     if negated {
                         ranges = invert_ranges(ranges);

diff --git a/src/vm.rs b/src/vm.rs
@@ -229,12 +229,12 @@ impl<'r, 't> Nfa<'r, 't> {
                 }
             }
             CharClass(ref ranges, flags) => {
-                if let Some(c) = self.chars.prev {
+                if let Some(mut c) = self.chars.prev {
                     let negate = flags & FLAG_NEGATED > 0;
-                    let casei = flags & FLAG_NOCASE > 0;
-                    let found =
-                        ranges.binary_search_by(|&rc| class_cmp(casei, c, rc))
-                              .is_ok();
+                    if flags & FLAG_NOCASE > 0 {
+                        c = c.to_uppercase().next().unwrap();
+                    }
+                    let found = ranges.binary_search_by(|&rc| class_cmp(c, rc)).is_ok();
                     if found ^ negate {
                         self.add(nlist, pc+1, caps);
                     }
@@ -540,26 +540,9 @@ pub fn is_word(c: Option<char>) -> bool {
 /// indicating whether the character is less than the start of the range,
 /// in the range (inclusive) or greater than the end of the range.
 ///
-/// If `casei` is `true`, then this ordering is computed case insensitively.
-///
 /// This function is meant to be used with a binary search.
 #[inline]
-fn class_cmp(casei: bool, mut textc: char,
-             (mut start, mut end): (char, char)) -> Ordering {
-    if casei {
-        // FIXME: This is pretty ridiculous. All of this case conversion
-        // can be moved outside this function:
-        // 1) textc should be uppercased outside the bsearch.
-        // 2) the character class itself should be uppercased either in the
-        //    parser or the compiler.
-        // FIXME: This is too simplistic for correct Unicode support.
-        //        See also: char_eq
-        // FIXME: Standard library now yields iterators, so we should take
-        //        advantage of them.
-        textc = textc.to_uppercase().next().unwrap();
-        start = start.to_uppercase().next().unwrap();
-        end = end.to_uppercase().next().unwrap();
-    }
+fn class_cmp(textc: char, (start, end): (char, char)) -> Ordering {
     if textc >= start && textc <= end {
         Ordering::Equal
     } else if start > textc {