Merge pull request #78 from SimonSapin/case-folding

BurntSushi · BurntSushi · commit f0791074ff2b · 2015-04-19T16:31:34.000-04:00
Fix case-insensitivity-related bugs
diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs
@@ -40,6 +40,7 @@ use regex::native::{
     Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
     Program, Dynamic, ExDynamic, Native,
     FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
+    simple_case_fold,
 };
 
 /// For the `regex!` syntax extension. Do not use.
@@ -154,7 +155,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
     use regex::native::{
         MatchKind, Exists, Location, Submatches,
         StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
-        CharReader, find_prefix,
+        CharReader, find_prefix, simple_case_fold,
     };
 
     return Nfa {
@@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                 }
                 OneChar(c, flags) => {
                     if flags & FLAG_NOCASE > 0 {
-                        let upc = c.to_uppercase().next().unwrap();
+                        let upc = simple_case_fold(c);
                         quote_expr!(self.cx, {
-                            let upc = self.chars.prev.map(|c| {
-                                c.to_uppercase().next().unwrap()
-                            });
+                            let upc = self.chars.prev.map(simple_case_fold);
                             if upc == Some($upc) {
                                 self.add(nlist, $nextpc, caps);
                             }
@@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         if casei {
                             quote_expr!(
                                 self.cx,
-                                self.chars.prev.unwrap()
-                                    .to_uppercase().next().unwrap())
+                                simple_case_fold(self.chars.prev.unwrap()))
                         } else {
                             quote_expr!(self.cx, self.chars.prev.unwrap())
                         };
@@ -494,7 +492,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         } else {
                             quote_expr!(self.cx, found)
                         };
-                    let mranges = self.match_class(casei, &ranges);
+                    let mranges = self.match_class(&ranges);
                     quote_expr!(self.cx, {
                         if self.chars.prev.is_some() {
                             let c = $get_char;
@@ -529,12 +527,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
     // Translates a character class into a match expression.
     // This avoids a binary search (and is hopefully replaced by a jump
     // table).
-    fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
-        let mut arms = ranges.iter().map(|&(mut start, mut end)| {
-            if casei {
-                start = start.to_uppercase().next().unwrap();
-                end = end.to_uppercase().next().unwrap();
-            }
+    fn match_class(&self, ranges: &[(char, char)]) -> P<ast::Expr> {
+        let mut arms = ranges.iter().map(|&(start, end)| {
             let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
                                                          quote_expr!(self.cx, $end)));
             self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))
diff --git a/regex_macros/tests/tests.rs b/regex_macros/tests/tests.rs
@@ -330,6 +330,9 @@ mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
 mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
 mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
 
+// Regression test for https://github.com/rust-lang/regex/issues/76
+mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
+
 // Test the Unicode friendliness of Perl character classes.
 mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
 mat!(uni_perl_w_not, r"\w+", "⥡", None);
@@ -355,6 +358,10 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
 mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
 mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
 
+// Regression test for https://github.com/rust-lang/regex/issues/75
+mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2)));
+mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2)));
+
 // A whole mess of tests from Glenn Fowler's regex test suite.
 // Generated by the 'src/etc/regex-match-tests' program.
 #[path = "matches.rs"]
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -194,6 +194,19 @@ def load_properties(f, interestingprops):
 
     return props
 
+def load_case_folding(f):
+    fetch(f)
+    re1 = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;")
+    c_plus_s = []
+    for line in fileinput.input(f):
+        m = re1.match(line)
+        if m:
+            a = int(m.group(1), 16)
+            b = int(m.group(2), 16)
+            c_plus_s.append((a, b))
+
+    return {"C_plus_S": c_plus_s}
+
 def escape_char(c):  # integer c -> single-quoted \u{...} escape with lowercase hex digits
     return "'\\u{%x}'" % c
 
@@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data):
         scripts = load_properties("Scripts.txt", [])
         props = load_properties("PropList.txt",
                 ["White_Space", "Join_Control", "Noncharacter_Code_Point"])
+        case_folding = load_case_folding("CaseFolding.txt")
 
         # all of these categories will also be available as \p{} in libregex
         allcats = []
@@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data):
 
         # emit lookup tables for \p{}, along with \d, \w, and \s for libregex
         emit_regex_module(rf, allcats, perl_words)
+        emit_property_module(rf, "case_folding", case_folding)
diff --git a/src/lib.rs b/src/lib.rs
@@ -413,7 +413,7 @@ pub mod native {
     };
     pub use re::{ExDynamic, ExNative};
     pub use re::Regex::{Dynamic, Native};
-    pub use vm::{CharReader, find_prefix};
+    pub use vm::{CharReader, find_prefix, simple_case_fold};
     pub use vm::MatchKind::{self, Exists, Location, Submatches};
     pub use vm::StepState::{
         self, StepMatchEarlyReturn, StepMatch, StepContinue,
diff --git a/src/parse.rs b/src/parse.rs
@@ -14,6 +14,7 @@ use std::fmt;
 
 /// Static data containing Unicode ranges for general categories and scripts.
 use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
+use vm::simple_case_fold;
 
 use self::Ast::*;
 use self::Repeater::*;
@@ -213,7 +214,14 @@ impl Parser {
                 '?' | '*' | '+' => try!(self.push_repeater(c)),
                 '\\' => {
                     let ast = try!(self.parse_escape());
-                    self.push(ast)
+                    if let AstClass(mut ranges, flags) = ast {
+                        if flags & FLAG_NOCASE > 0 {
+                            ranges = case_fold_and_combine_ranges(ranges);
+                        }
+                        self.push(AstClass(ranges, flags))
+                    } else {
+                        self.push(ast)
+                    }
                 }
                 '{' => try!(self.parse_counted()),
                 '[' => match self.try_parse_ascii() {
@@ -421,7 +429,11 @@ impl Parser {
                     }
                 }
                 ']' if ranges.len() > 0 => {
-                    ranges = combine_ranges(ranges);
+                    if self.flags & FLAG_NOCASE > 0 {
+                        ranges = case_fold_and_combine_ranges(ranges)
+                    } else {
+                        ranges = combine_ranges(ranges);
+                    }
                     if negated {
                         ranges = invert_ranges(ranges);
                     }
@@ -976,6 +988,35 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
     ordered
 }
 
+// Case-folds every char covered by `ranges` with `simple_case_fold`, then regroups the
+// distinct results, in sorted order, into runs that `inc_char` reports as consecutive.
+// FIXME: Is there a clever way to do this by considering ranges rather than individual chars?
+// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table
+fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
+    if ranges.is_empty() {
+        return ranges;
+    }
+    let mut folded: Vec<char> = Vec::new();
+    for (lo, hi) in ranges {
+        let code_points = lo as u32 .. hi as u32 + 1;
+        folded.extend(code_points.filter_map(char::from_u32).map(simple_case_fold));
+    }
+    folded.sort();
+    folded.dedup();
+    let mut combined = Vec::new();
+    let mut run = (folded[0], folded[0]);
+    for &c in &folded[1..] {
+        if c == inc_char(run.1) {
+            run.1 = c;
+        } else {
+            combined.push(run);
+            run = (c, c);
+        }
+    }
+    combined.push(run);
+    combined
+}
+
 fn invert_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
     if ranges.is_empty() { return ranges; }
 
diff --git a/src/unicode.rs b/src/unicode.rs
diff --git a/src/vm.rs b/src/vm.rs