Use Unicode simple case folding for case-insensitivity. Fix #55.

SimonSapin · SimonSapin · commit a3459ce726d4 · 2015-04-19T18:09:45.000+02:00
diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs
@@ -40,6 +40,7 @@ use regex::native::{
     Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
     Program, Dynamic, ExDynamic, Native,
     FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
+    simple_case_fold,
 };
 
 /// For the `regex!` syntax extension. Do not use.
@@ -154,7 +155,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
     use regex::native::{
         MatchKind, Exists, Location, Submatches,
         StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
-        CharReader, find_prefix,
+        CharReader, find_prefix, simple_case_fold,
     };
 
     return Nfa {
@@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                 }
                 OneChar(c, flags) => {
                     if flags & FLAG_NOCASE > 0 {
-                        let upc = c.to_uppercase().next().unwrap();
+                        let upc = simple_case_fold(c);
                         quote_expr!(self.cx, {
-                            let upc = self.chars.prev.map(|c| {
-                                c.to_uppercase().next().unwrap()
-                            });
+                            let upc = self.chars.prev.map(simple_case_fold);
                             if upc == Some($upc) {
                                 self.add(nlist, $nextpc, caps);
                             }
@@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
                         if casei {
                             quote_expr!(
                                 self.cx,
-                                self.chars.prev.unwrap()
-                                    .to_uppercase().next().unwrap())
+                                simple_case_fold(self.chars.prev.unwrap()))
                         } else {
                             quote_expr!(self.cx, self.chars.prev.unwrap())
                         };
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -194,6 +194,19 @@ def load_properties(f, interestingprops):
 
     return props
 
+def load_case_folding(f):  # Parse file `f` into {"C_plus_S": [(code, mapping), ...]} with both values as ints.
+    fetch(f)  # NOTE(review): presumably makes sure `f` is available locally before it is read -- confirm against fetch()
+    re1 = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;")  # fields: hex code ; single letter C or S ; one hex mapping
+    c_plus_s = []  # (code, mapping) pairs, kept in the order they appear in the file
+    for line in fileinput.input(f):
+        m = re1.match(line)
+        if m:  # any line not matching the pattern above is silently skipped
+            a = int(m.group(1), 16)  # first field, parsed as hexadecimal
+            b = int(m.group(2), 16)  # third field, parsed as hexadecimal
+            c_plus_s.append((a, b))
+
+    return {"C_plus_S": c_plus_s}  # one-entry dict keyed by table name
+
 def escape_char(c):
     return "'\\u{%x}'" % c
 
@@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data):
         scripts = load_properties("Scripts.txt", [])
         props = load_properties("PropList.txt",
                 ["White_Space", "Join_Control", "Noncharacter_Code_Point"])
+        case_folding = load_case_folding("CaseFolding.txt")
 
         # all of these categories will also be available as \p{} in libregex
         allcats = []
@@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data):
 
         # emit lookup tables for \p{}, along with \d, \w, and \s for libregex
         emit_regex_module(rf, allcats, perl_words)
+        emit_property_module(rf, "case_folding", case_folding)
diff --git a/src/lib.rs b/src/lib.rs
@@ -413,7 +413,7 @@ pub mod native {
     };
     pub use re::{ExDynamic, ExNative};
     pub use re::Regex::{Dynamic, Native};
-    pub use vm::{CharReader, find_prefix};
+    pub use vm::{CharReader, find_prefix, simple_case_fold};
     pub use vm::MatchKind::{self, Exists, Location, Submatches};
     pub use vm::StepState::{
         self, StepMatchEarlyReturn, StepMatch, StepContinue,
diff --git a/src/parse.rs b/src/parse.rs
@@ -14,6 +14,7 @@ use std::fmt;
 
 /// Static data containing Unicode ranges for general categories and scripts.
 use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
+use vm::simple_case_fold;
 
 use self::Ast::*;
 use self::Repeater::*;
@@ -987,6 +988,8 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
     ordered
 }
 
+// FIXME: Is there a clever way to do this by considering ranges rather than individual chars?
+// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table
 fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
     if ranges.is_empty() {
         return ranges
@@ -995,9 +998,10 @@ fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)>
         .into_iter()
         .flat_map(|(start, end)| start as u32 .. end as u32 + 1)
         .filter_map(char::from_u32)
-        .map(|c| c.to_uppercase().next().unwrap())
+        .map(simple_case_fold)
         .collect();
     chars.sort();
+    chars.dedup();
     let mut chars = chars.into_iter();
     let mut start = chars.next().unwrap();
     let mut end = start;
diff --git a/src/unicode.rs b/src/unicode.rs
diff --git a/src/vm.rs b/src/vm.rs