Skip to content

Commit a3459ce

Browse files
committed
Use Unicode simple case folding for case-insensitivity. Fix #55.
1 parent 2561bfd commit a3459ce

File tree

6 files changed

+459
-18
lines changed

6 files changed

+459
-18
lines changed

regex_macros/src/lib.rs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ use regex::native::{
4040
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
4141
Program, Dynamic, ExDynamic, Native,
4242
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
43+
simple_case_fold,
4344
};
4445

4546
/// For the `regex!` syntax extension. Do not use.
@@ -154,7 +155,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
154155
use regex::native::{
155156
MatchKind, Exists, Location, Submatches,
156157
StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
157-
CharReader, find_prefix,
158+
CharReader, find_prefix, simple_case_fold,
158159
};
159160

160161
return Nfa {
@@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
459460
}
460461
OneChar(c, flags) => {
461462
if flags & FLAG_NOCASE > 0 {
462-
let upc = c.to_uppercase().next().unwrap();
463+
let upc = simple_case_fold(c);
463464
quote_expr!(self.cx, {
464-
let upc = self.chars.prev.map(|c| {
465-
c.to_uppercase().next().unwrap()
466-
});
465+
let upc = self.chars.prev.map(simple_case_fold);
467466
if upc == Some($upc) {
468467
self.add(nlist, $nextpc, caps);
469468
}
@@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
483482
if casei {
484483
quote_expr!(
485484
self.cx,
486-
self.chars.prev.unwrap()
487-
.to_uppercase().next().unwrap())
485+
simple_case_fold(self.chars.prev.unwrap()))
488486
} else {
489487
quote_expr!(self.cx, self.chars.prev.unwrap())
490488
};

scripts/unicode.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,19 @@ def load_properties(f, interestingprops):
194194

195195
return props
196196

197+
def load_case_folding(f):
198+
fetch(f)
199+
re1 = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;")
200+
c_plus_s = []
201+
for line in fileinput.input(f):
202+
m = re1.match(line)
203+
if m:
204+
a = int(m.group(1), 16)
205+
b = int(m.group(2), 16)
206+
c_plus_s.append((a, b))
207+
208+
return {"C_plus_S": c_plus_s}
209+
197210
def escape_char(c):
198211
return "'\\u{%x}'" % c
199212

@@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data):
258271
scripts = load_properties("Scripts.txt", [])
259272
props = load_properties("PropList.txt",
260273
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
274+
case_folding = load_case_folding("CaseFolding.txt")
261275

262276
# all of these categories will also be available as \p{} in libregex
263277
allcats = []
@@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data):
280294

281295
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
282296
emit_regex_module(rf, allcats, perl_words)
297+
emit_property_module(rf, "case_folding", case_folding)

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ pub mod native {
413413
};
414414
pub use re::{ExDynamic, ExNative};
415415
pub use re::Regex::{Dynamic, Native};
416-
pub use vm::{CharReader, find_prefix};
416+
pub use vm::{CharReader, find_prefix, simple_case_fold};
417417
pub use vm::MatchKind::{self, Exists, Location, Submatches};
418418
pub use vm::StepState::{
419419
self, StepMatchEarlyReturn, StepMatch, StepContinue,

src/parse.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use std::fmt;
1414

1515
/// Static data containing Unicode ranges for general categories and scripts.
1616
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
17+
use vm::simple_case_fold;
1718

1819
use self::Ast::*;
1920
use self::Repeater::*;
@@ -987,6 +988,8 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
987988
ordered
988989
}
989990

991+
// FIXME: Is there a clever way to do this by considering ranges rather than individual chars?
992+
// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table
990993
fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
991994
if ranges.is_empty() {
992995
return ranges
@@ -995,9 +998,10 @@ fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)>
995998
.into_iter()
996999
.flat_map(|(start, end)| start as u32 .. end as u32 + 1)
9971000
.filter_map(char::from_u32)
998-
.map(|c| c.to_uppercase().next().unwrap())
1001+
.map(simple_case_fold)
9991002
.collect();
10001003
chars.sort();
1004+
chars.dedup();
10011005
let mut chars = chars.into_iter();
10021006
let mut start = chars.next().unwrap();
10031007
let mut end = start;

0 commit comments

Comments
 (0)