Skip to content

Commit 2561bfd

Browse files
committed
Fix #76 case-insensitive matching of character ranges
1 parent 32974d5 commit 2561bfd

File tree

2 files changed

+40
-7
lines changed

2 files changed

+40
-7
lines changed

regex_macros/tests/tests.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,9 @@ mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
330330
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
331331
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
332332

333+
// https://github.com/rust-lang/regex/issues/76
334+
mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
335+
333336
// Test the Unicode friendliness of Perl character classes.
334337
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
335338
mat!(uni_perl_w_not, r"\w+", "⥡", None);

src/parse.rs

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,14 @@ impl Parser {
213213
'?' | '*' | '+' => try!(self.push_repeater(c)),
214214
'\\' => {
215215
let ast = try!(self.parse_escape());
216-
self.push(ast)
216+
if let AstClass(mut ranges, flags) = ast {
217+
if flags & FLAG_NOCASE > 0 {
218+
ranges = case_fold_and_combine_ranges(ranges);
219+
}
220+
self.push(AstClass(ranges, flags))
221+
} else {
222+
self.push(ast)
223+
}
217224
}
218225
'{' => try!(self.parse_counted()),
219226
'[' => match self.try_parse_ascii() {
@@ -422,13 +429,10 @@ impl Parser {
422429
}
423430
']' if ranges.len() > 0 => {
424431
if self.flags & FLAG_NOCASE > 0 {
425-
// FIMXE(https://github.com/rust-lang/regex/issues/76): This is wrong.
426-
for range in &mut ranges {
427-
range.0 = range.0.to_uppercase().next().unwrap();
428-
range.1 = range.1.to_uppercase().next().unwrap();
429-
}
432+
ranges = case_fold_and_combine_ranges(ranges)
433+
} else {
434+
ranges = combine_ranges(ranges);
430435
}
431-
ranges = combine_ranges(ranges);
432436
if negated {
433437
ranges = invert_ranges(ranges);
434438
}
@@ -983,6 +987,32 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
983987
ordered
984988
}
985989

990+
fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
991+
if ranges.is_empty() {
992+
return ranges
993+
}
994+
let mut chars: Vec<char> = ranges
995+
.into_iter()
996+
.flat_map(|(start, end)| start as u32 .. end as u32 + 1)
997+
.filter_map(char::from_u32)
998+
.map(|c| c.to_uppercase().next().unwrap())
999+
.collect();
1000+
chars.sort();
1001+
let mut chars = chars.into_iter();
1002+
let mut start = chars.next().unwrap();
1003+
let mut end = start;
1004+
let mut ranges = Vec::new();
1005+
for c in chars {
1006+
if c != inc_char(end) {
1007+
ranges.push((start, end));
1008+
start = c;
1009+
}
1010+
end = c;
1011+
}
1012+
ranges.push((start, end));
1013+
ranges
1014+
}
1015+
9861016
fn invert_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
9871017
if ranges.is_empty() { return ranges; }
9881018

0 commit comments

Comments
 (0)