Skip to content

Commit f079107

Browse files
committed
Merge pull request #78 from SimonSapin/case-folding
Fix case-insensitivity-related bugs
2 parents 399758a + a3459ce commit f079107

File tree

7 files changed

+512
-48
lines changed

7 files changed

+512
-48
lines changed

regex_macros/src/lib.rs

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ use regex::native::{
4040
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
4141
Program, Dynamic, ExDynamic, Native,
4242
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
43+
simple_case_fold,
4344
};
4445

4546
/// For the `regex!` syntax extension. Do not use.
@@ -154,7 +155,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
154155
use regex::native::{
155156
MatchKind, Exists, Location, Submatches,
156157
StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
157-
CharReader, find_prefix,
158+
CharReader, find_prefix, simple_case_fold,
158159
};
159160

160161
return Nfa {
@@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
459460
}
460461
OneChar(c, flags) => {
461462
if flags & FLAG_NOCASE > 0 {
462-
let upc = c.to_uppercase().next().unwrap();
463+
let upc = simple_case_fold(c);
463464
quote_expr!(self.cx, {
464-
let upc = self.chars.prev.map(|c| {
465-
c.to_uppercase().next().unwrap()
466-
});
465+
let upc = self.chars.prev.map(simple_case_fold);
467466
if upc == Some($upc) {
468467
self.add(nlist, $nextpc, caps);
469468
}
@@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
483482
if casei {
484483
quote_expr!(
485484
self.cx,
486-
self.chars.prev.unwrap()
487-
.to_uppercase().next().unwrap())
485+
simple_case_fold(self.chars.prev.unwrap()))
488486
} else {
489487
quote_expr!(self.cx, self.chars.prev.unwrap())
490488
};
@@ -494,7 +492,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
494492
} else {
495493
quote_expr!(self.cx, found)
496494
};
497-
let mranges = self.match_class(casei, &ranges);
495+
let mranges = self.match_class(&ranges);
498496
quote_expr!(self.cx, {
499497
if self.chars.prev.is_some() {
500498
let c = $get_char;
@@ -529,12 +527,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
529527
// Translates a character class into a match expression.
530528
// This avoids a binary search (and is hopefully replaced by a jump
531529
// table).
532-
fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
533-
let mut arms = ranges.iter().map(|&(mut start, mut end)| {
534-
if casei {
535-
start = start.to_uppercase().next().unwrap();
536-
end = end.to_uppercase().next().unwrap();
537-
}
530+
fn match_class(&self, ranges: &[(char, char)]) -> P<ast::Expr> {
531+
let mut arms = ranges.iter().map(|&(start, end)| {
538532
let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
539533
quote_expr!(self.cx, $end)));
540534
self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))

regex_macros/tests/tests.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,9 @@ mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
330330
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
331331
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
332332

333+
// https://github.com/rust-lang/regex/issues/76
334+
mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
335+
333336
// Test the Unicode friendliness of Perl character classes.
334337
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
335338
mat!(uni_perl_w_not, r"\w+", "⥡", None);
@@ -355,6 +358,10 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
355358
mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
356359
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
357360

361+
// Regression test for https://github.com/rust-lang/regex/issues/75
362+
mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2)));
363+
mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2)));
364+
358365
// A whole mess of tests from Glenn Fowler's regex test suite.
359366
// Generated by the 'src/etc/regex-match-tests' program.
360367
#[path = "matches.rs"]

scripts/unicode.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,19 @@ def load_properties(f, interestingprops):
194194

195195
return props
196196

197+
def load_case_folding(f):
    """Collect the (code point, folded code point) pairs listed in file `f`.

    Only lines whose second field is the status letter `C` or `S` are kept;
    every other line (comments, blank lines, other statuses) is ignored.
    Both code points are parsed from upper-case hexadecimal.

    Returns a dict with the single key "C_plus_S" mapping to the list of
    (int, int) pairs in the order they appear in the file.
    """
    # NOTE(review): `fetch` is defined elsewhere in this script; presumably it
    # makes sure the data file is available locally -- confirm.
    fetch(f)
    entry = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;")
    matches = (entry.match(line) for line in fileinput.input(f))
    pairs = [(int(m.group(1), 16), int(m.group(2), 16)) for m in matches if m]
    return {"C_plus_S": pairs}
209+
197210
def escape_char(c):
198211
return "'\\u{%x}'" % c
199212

@@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data):
258271
scripts = load_properties("Scripts.txt", [])
259272
props = load_properties("PropList.txt",
260273
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
274+
case_folding = load_case_folding("CaseFolding.txt")
261275

262276
# all of these categories will also be available as \p{} in libregex
263277
allcats = []
@@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data):
280294

281295
# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
282296
emit_regex_module(rf, allcats, perl_words)
297+
emit_property_module(rf, "case_folding", case_folding)

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ pub mod native {
413413
};
414414
pub use re::{ExDynamic, ExNative};
415415
pub use re::Regex::{Dynamic, Native};
416-
pub use vm::{CharReader, find_prefix};
416+
pub use vm::{CharReader, find_prefix, simple_case_fold};
417417
pub use vm::MatchKind::{self, Exists, Location, Submatches};
418418
pub use vm::StepState::{
419419
self, StepMatchEarlyReturn, StepMatch, StepContinue,

src/parse.rs

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use std::fmt;
1414

1515
/// Static data containing Unicode ranges for general categories and scripts.
1616
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
17+
use vm::simple_case_fold;
1718

1819
use self::Ast::*;
1920
use self::Repeater::*;
@@ -213,7 +214,14 @@ impl Parser {
213214
'?' | '*' | '+' => try!(self.push_repeater(c)),
214215
'\\' => {
215216
let ast = try!(self.parse_escape());
216-
self.push(ast)
217+
if let AstClass(mut ranges, flags) = ast {
218+
if flags & FLAG_NOCASE > 0 {
219+
ranges = case_fold_and_combine_ranges(ranges);
220+
}
221+
self.push(AstClass(ranges, flags))
222+
} else {
223+
self.push(ast)
224+
}
217225
}
218226
'{' => try!(self.parse_counted()),
219227
'[' => match self.try_parse_ascii() {
@@ -421,7 +429,11 @@ impl Parser {
421429
}
422430
}
423431
']' if ranges.len() > 0 => {
424-
ranges = combine_ranges(ranges);
432+
if self.flags & FLAG_NOCASE > 0 {
433+
ranges = case_fold_and_combine_ranges(ranges)
434+
} else {
435+
ranges = combine_ranges(ranges);
436+
}
425437
if negated {
426438
ranges = invert_ranges(ranges);
427439
}
@@ -976,6 +988,35 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
976988
ordered
977989
}
978990

991+
// FIXME: Is there a clever way to do this by considering ranges rather than individual chars?
992+
// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table
993+
fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
994+
if ranges.is_empty() {
995+
return ranges
996+
}
997+
let mut chars: Vec<char> = ranges
998+
.into_iter()
999+
.flat_map(|(start, end)| start as u32 .. end as u32 + 1)
1000+
.filter_map(char::from_u32)
1001+
.map(simple_case_fold)
1002+
.collect();
1003+
chars.sort();
1004+
chars.dedup();
1005+
let mut chars = chars.into_iter();
1006+
let mut start = chars.next().unwrap();
1007+
let mut end = start;
1008+
let mut ranges = Vec::new();
1009+
for c in chars {
1010+
if c != inc_char(end) {
1011+
ranges.push((start, end));
1012+
start = c;
1013+
}
1014+
end = c;
1015+
}
1016+
ranges.push((start, end));
1017+
ranges
1018+
}
1019+
9791020
fn invert_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
9801021
if ranges.is_empty() { return ranges; }
9811022

0 commit comments

Comments
 (0)