Skip to content

Commit 7b26028

Browse files
authored
Support text segment boundary anchors (#178)
This enables the `\y` and `\Y` anchors in regex literals and `Anchor.textSegmentBoundary` in the DSL. Note: This also adds a `UnicodeScalar` conformance to `RegexProtocol`, so that a `UnicodeScalar` used in the DSL matches the same way a Unicode scalar literal (e.g. `\u{65}`) does in a regex literal.
1 parent 996c6d3 commit 7b26028

File tree

4 files changed

+43
-11
lines changed

4 files changed

+43
-11
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,12 +99,16 @@ extension Compiler.ByteCodeGen {
9999
}
100100

101101
case .textSegment:
102-
// This we should be able to do!
103-
throw Unsupported(#"\y (text segment)"#)
102+
builder.buildAssert { (input, pos, _) in
103+
// FIXME: Select grapheme-cluster or word boundaries based on the matching options; for now this always checks grapheme-cluster boundaries.
104+
input.isOnGraphemeClusterBoundary(pos)
105+
}
104106

105107
case .notTextSegment:
106-
// This we should be able to do!
107-
throw Unsupported(#"\Y (not text segment)"#)
108+
builder.buildAssert { (input, pos, _) in
109+
// FIXME: Select grapheme-cluster or word boundaries based on the matching options; for now this always checks grapheme-cluster boundaries.
110+
!input.isOnGraphemeClusterBoundary(pos)
111+
}
108112

109113
case .startOfLine:
110114
builder.buildAssert { (input, pos, bounds) in

Sources/_StringProcessing/RegexDSL/DSL.swift

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,25 +17,31 @@ extension String: RegexProtocol {
1717
public typealias Match = Substring
1818

1919
public var regex: Regex<Match> {
20-
let atoms = self.map { atom(.char($0)) }
21-
return .init(ast: concat(atoms))
20+
.init(node: .quotedLiteral(self))
2221
}
2322
}
2423

2524
extension Substring: RegexProtocol {
2625
public typealias Match = Substring
2726

2827
public var regex: Regex<Match> {
29-
let atoms = self.map { atom(.char($0)) }
30-
return .init(ast: concat(atoms))
28+
.init(node: .quotedLiteral(String(self)))
3129
}
3230
}
3331

3432
extension Character: RegexProtocol {
3533
public typealias Match = Substring
3634

3735
public var regex: Regex<Match> {
38-
.init(ast: atom(.char(self)))
36+
.init(node: .atom(.char(self)))
37+
}
38+
}
39+
40+
extension UnicodeScalar: RegexProtocol {
41+
public typealias Match = Substring
42+
43+
public var regex: Regex<Match> {
44+
.init(node: .atom(.scalar(self)))
3945
}
4046
}
4147

Tests/RegexTests/MatchTests.swift

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,8 @@ extension RegexTests {
877877
#"\d+\b"#,
878878
("123", "123"),
879879
(" 123", "123"),
880-
("123 456", "123"))
880+
("123 456", "123"),
881+
("123A 456", "456"))
881882
firstMatchTests(
882883
#"\d+\b\s\b\d+"#,
883884
("123", nil),
@@ -893,7 +894,18 @@ extension RegexTests {
893894
// TODO: \G and \K
894895

895896
// TODO: Oniguruma \y and \Y
896-
897+
firstMatchTests(
898+
#"\u{65}"#, // Scalar 'e' is present in both:
899+
("Cafe\u{301}", "e"), // composed and
900+
("Sol Cafe", "e")) // standalone
901+
firstMatchTests(
902+
#"\u{65}\y"#, // Grapheme boundary assertion
903+
("Cafe\u{301}", nil),
904+
("Sol Cafe", "e"))
905+
firstMatchTests(
906+
#"\u{65}\Y"#, // Grapheme non-boundary assertion
907+
("Cafe\u{301}", "e"),
908+
("Sol Cafe", nil))
897909
}
898910

899911
func testMatchGroups() {

Tests/RegexTests/RegexDSLTests.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,16 @@ class RegexDSLTests: XCTestCase {
280280
Anchor.endOfLine
281281
}
282282

283+
try _testDSLCaptures(
284+
("Cafe\u{301}", nil),
285+
("Cafe", "Cafe"),
286+
matchType: Substring.self, ==)
287+
{
288+
oneOrMore(.word)
289+
UnicodeScalar("e")
290+
Anchor.textSegmentBoundary
291+
}
292+
283293
try _testDSLCaptures(
284294
("aaaaa1", "aaaaa1"),
285295
("aaaaa2", nil),

0 commit comments

Comments
 (0)