Skip to content

Commit c436bfd

Browse files
committed
Support nested character classes and intersection with &&
This implements parts of UTS#18 RL1.3, namely:

* Nested character classes, e.g.: `[a[b-c]]`
* Intersections in classes, e.g.: `[\w&&\p{Greek}]`

They can be combined to do things like `[\w&&[^a]]` to get all word characters except `a`.

Fixes #341
1 parent 204e409 commit c436bfd

File tree

2 files changed

+567
-54
lines changed

2 files changed

+567
-54
lines changed

regex-syntax/src/lib.rs

Lines changed: 158 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ impl CharClass {
680680
self.canonicalize()
681681
}
682682

683-
/// Canonicalze any sequence of ranges.
683+
/// Canonicalize any sequence of ranges.
684684
///
685685
/// This is responsible for enforcing the canonical format invariants
686686
/// as described on the docs for the `CharClass` type.
@@ -703,6 +703,43 @@ impl CharClass {
703703
ordered
704704
}
705705

706+
/// Calculate the intersection of two canonical character classes.
707+
///
708+
/// The returned intersection is canonical.
709+
fn intersection(&self, other: &CharClass) -> CharClass {
710+
if self.ranges.is_empty() || other.ranges.is_empty() {
711+
return CharClass::empty();
712+
}
713+
714+
let mut intersection = CharClass::empty();
715+
716+
let mut iter_a = self.ranges.iter();
717+
let mut iter_b = other.ranges.iter();
718+
let mut a = iter_a.next().unwrap();
719+
let mut b = iter_b.next().unwrap();
720+
loop {
721+
if let Some(i) = a.intersection(&b) {
722+
intersection.ranges.push(i);
723+
}
724+
725+
// If the range with the smaller end didn't match this time,
726+
// it won't ever match, so move on to the next one.
727+
let (iter, item) = if a.end < b.end {
728+
(&mut iter_a, &mut a)
729+
} else {
730+
(&mut iter_b, &mut b)
731+
};
732+
if let Some(v) = iter.next() {
733+
*item = v;
734+
} else {
735+
// No more ranges to check, done.
736+
break;
737+
}
738+
}
739+
740+
intersection.canonicalize()
741+
}
742+
706743
/// Negates the character class.
707744
///
708745
/// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
@@ -801,6 +838,18 @@ impl ClassRange {
801838
max(self.start, other.start) <= inc_char(min(self.end, other.end))
802839
}
803840

841+
/// Returns the intersection of the two ranges if they have common
842+
/// characters, `None` otherwise.
843+
fn intersection(&self, other: &ClassRange) -> Option<ClassRange> {
844+
let start = max(self.start, other.start);
845+
let end = min(self.end, other.end);
846+
if start <= end {
847+
Some(ClassRange::new(start, end))
848+
} else {
849+
None
850+
}
851+
}
852+
804853
/// Creates a new range representing the union of `self` and `other`.
805854
fn merge(self, other: ClassRange) -> ClassRange {
806855
ClassRange {
@@ -1907,6 +1956,108 @@ mod tests {
19071956
]));
19081957
}
19091958

1959+
#[test]
fn class_intersection_empty() {
    // Intersecting with an empty class must yield an empty class.
    let nothing = class(&[]);
    let only_a = class(&[('a', 'a')]);
    let want = class(&[]);
    assert_intersection(nothing, only_a, want);
}
1965+
1966+
#[test]
fn class_intersection_single_equal() {
    // Two identical single-character classes intersect to that character.
    let left = class(&[('a', 'a')]);
    let right = class(&[('a', 'a')]);
    let want = class(&[('a', 'a')]);
    assert_intersection(left, right, want);
}
1972+
1973+
#[test]
fn class_intersection_single_unequal() {
    // Distinct single characters have nothing in common.
    let only_a = class(&[('a', 'a')]);
    let only_b = class(&[('b', 'b')]);
    let want = class(&[]);
    assert_intersection(only_a, only_b, want);
}
1979+
1980+
#[test]
fn class_intersection_single_in_other() {
    // A single character lying inside a wider range survives unchanged.
    let single = class(&[('a', 'a')]);
    let wider = class(&[('a', 'c')]);
    let want = class(&[('a', 'a')]);
    assert_intersection(single, wider, want);
}
1986+
1987+
#[test]
fn class_intersection_range_in_other() {
    // A range sharing its start with a wider range is returned as-is.
    let narrow = class(&[('a', 'b')]);
    let wide = class(&[('a', 'c')]);
    let want = class(&[('a', 'b')]);
    assert_intersection(narrow, wide, want);
}
1993+
1994+
#[test]
fn class_intersection_range_intersection() {
    // Ranges overlapping in exactly one character yield that character.
    let a_to_b = class(&[('a', 'b')]);
    let b_to_c = class(&[('b', 'c')]);
    let want = class(&[('b', 'b')]);
    assert_intersection(a_to_b, b_to_c, want);
}
2000+
2001+
#[test]
fn class_intersection_only_adjacent() {
    // Ranges that merely touch (b next to c) share no character.
    let a_to_b = class(&[('a', 'b')]);
    let c_to_d = class(&[('c', 'd')]);
    let want = class(&[]);
    assert_intersection(a_to_b, c_to_d, want);
}
2007+
2008+
#[test]
fn class_intersection_range_subset() {
    // A range strictly inside another is the whole intersection.
    let inner = class(&[('b', 'c')]);
    let outer = class(&[('a', 'd')]);
    let want = class(&[('b', 'c')]);
    assert_intersection(inner, outer, want);
}
2014+
2015+
#[test]
fn class_intersection_many_ranges_in_one_big() {
    // Several small ranges covered by one big range all come through.
    let pieces = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let cover = class(&[('a', 'h')]);
    let want = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    assert_intersection(pieces, cover, want);
}
2023+
2024+
#[test]
fn class_intersection_many_ranges_same() {
    // Intersecting a multi-range class with an equal class gives it back.
    let left = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let right = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let want = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    assert_intersection(left, right, want);
}
2032+
2033+
#[test]
fn class_intersection_multiple_non_intersecting() {
    // Interleaved ranges separated by gaps never overlap.
    let left = class(&[('a', 'b'), ('g', 'h')]);
    let right = class(&[('d', 'e'), ('k', 'l')]);
    let want = class(&[]);
    assert_intersection(left, right, want);
}
2039+
2040+
#[test]
fn class_intersection_non_intersecting_then_intersecting() {
    // Leading ranges that miss must be skipped before the one that hits.
    let several = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
    let only_h = class(&[('h', 'h')]);
    let want = class(&[('h', 'h')]);
    assert_intersection(several, only_h, want);
}
2046+
2047+
#[test]
fn class_intersection_adjacent_alternating() {
    // The two classes tile the alphabet alternately without overlapping.
    let left = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
    let right = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
    let want = class(&[]);
    assert_intersection(left, right, want);
}
2053+
2054+
#[test]
fn class_intersection_overlapping_alternating() {
    // Staggered ranges overlap pairwise; the expected result is the single
    // range b-f.
    let left = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
    let right = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
    let want = class(&[('b', 'f')]);
    assert_intersection(left, right, want);
}
2060+
19102061
#[test]
19112062
fn class_canon_overlap_many_case_fold() {
19122063
let cls = class(&[
@@ -2056,4 +2207,10 @@ mod tests {
20562207
let expr = e("(?-u)[-./]");
20572208
assert_eq!("(?-u:[-\\.-/])", expr.to_string());
20582209
}
2210+
2211+
fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) {
2212+
// intersection operation should be commutative
2213+
assert_eq!(cls1.intersection(&cls2), expected);
2214+
assert_eq!(cls2.intersection(&cls1), expected);
2215+
}
20592216
}

0 commit comments

Comments
 (0)