Skip to content

Commit 0e25188

Browse files
committed
Implement lookahead and negative lookahead assertions
1 parent bfea2bf commit 0e25188

File tree

4 files changed

+162
-47
lines changed

4 files changed

+162
-47
lines changed

Sources/_MatchingEngine/Regex/AST/Group.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,4 +116,18 @@ extension AST.Group: _ASTPrintable {
116116
}
117117
}
118118

119+
extension AST.Group {
  /// Classifies this group as a lookaround assertion, if it is one.
  ///
  /// - Returns: A pair whose `forwards` component is `true` for the
  ///   lookahead kinds and `false` for the lookbehind kinds, and whose
  ///   `positive` component is `false` for the negated kinds. Any
  ///   other group kind produces `nil`.
  public var lookaroundKind: (forwards: Bool, positive: Bool)? {
    switch kind.value {
    case .lookahead:
      return (forwards: true, positive: true)
    case .negativeLookahead:
      return (forwards: true, positive: false)
    case .lookbehind:
      return (forwards: false, positive: true)
    case .negativeLookbehind:
      return (forwards: false, positive: false)
    default:
      // Every remaining group kind is not a lookaround.
      return nil
    }
  }
}

Sources/_StringProcessing/Compiler.swift

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,21 @@ class Compiler {
7979
case .trivia, .empty:
8080
break
8181

82-
// FIXME: This can't be right...
8382
case .group(let g):
84-
try emit(g.child)
83+
if let lookaround = g.lookaroundKind {
84+
try emitLookaround(lookaround, g.child)
85+
return
86+
}
87+
88+
switch g.kind.value {
89+
case .lookahead, .negativeLookahead,
90+
.lookbehind, .negativeLookbehind:
91+
fatalError("unreachable")
92+
93+
default:
94+
// FIXME: This can't be right...
95+
try emit(g.child)
96+
}
8597

8698
case .quantification(let quant):
8799
try emitQuantification(quant)
@@ -101,6 +113,53 @@ class Compiler {
101113
}
102114
}
103115

116+
/// Emits the instruction sequence for a lookaround assertion whose
/// body is `child`.
///
/// Only forward assertions are handled here; when `kind.forwards` is
/// `false` this throws the error built by
/// `unsupported("backwards assertions")` before emitting anything.
///
/// Emitted layout:
///
///       save(restoringAt: success)
///       save(restoringAt: intercept)
///       <sub-pattern>     // failure restores at intercept
///       clearSavePoint    // remove intercept
///       <if negative>:
///         clearSavePoint  // remove success
///       fail              // positive->success, negative propagates
///     intercept:
///       <if positive>:
///         clearSavePoint  // remove success
///       fail              // positive propagates, negative->success
///     success:
///       ...
func emitLookaround(
  _ kind: (forwards: Bool, positive: Bool),
  _ child: AST
) throws {
  let (isForwards, isPositive) = kind
  if !isForwards {
    throw unsupported("backwards assertions")
  }

  // Addresses are requested in the same order as before:
  // intercept first, then success.
  let interceptAddress = builder.makeAddress()
  let successAddress = builder.makeAddress()

  // The success save point is pushed first so that the intercept
  // save point is the one sitting on top while the child runs.
  builder.buildSave(successAddress)
  builder.buildSave(interceptAddress)
  try emit(child)

  // Child matched: drop the intercept save point, and for a negative
  // assertion also drop the success save point, then fail.
  builder.buildClear()
  if !isPositive {
    builder.buildClear()
  }
  builder.buildFail()

  // Child failed: for a positive assertion drop the success save
  // point, then fail.
  builder.label(interceptAddress)
  if isPositive {
    builder.buildClear()
  }
  builder.buildFail()

  builder.label(successAddress)
}
161+
162+
104163
func compileQuantification(
105164
low: Int,
106165
high: Int?,

Tests/RegexTests/CompileTests.swift

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,28 @@ import XCTest
66

77
extension RegexTests {
88

9+
/// Compiles every regex in `equivs` and records a test failure for
/// each one whose instruction stream differs from that of the first
/// entry, which serves as the reference.
///
/// - Parameter equivs: Regex source strings expected to compile to
///   the same instructions. Must contain at least one entry; an empty
///   list is reported as a test failure rather than crashing.
/// - Throws: Whatever `_compileRegex` throws for any of the inputs.
private func testCompilationEquivalence(
  _ equivs: [String]
) throws {
  // Report an empty row through XCTest instead of trapping on a
  // force-unwrap, so the remaining tests still run.
  guard let refPattern = equivs.first else {
    XCTFail("Expected at least one regex to compare")
    return
  }

  // Compile everything up front so a compilation error surfaces
  // before any comparison is made.
  let progs = try equivs.map {
    try _compileRegex($0).engine.program
  }
  guard let ref = progs.first else {
    XCTFail("Expected at least one compiled program")
    return
  }

  for (pattern, prog) in zip(equivs, progs).dropFirst() {
    if !ref.instructions.elementsEqual(prog.instructions) {
      // Name both patterns so the diverging spelling is identifiable
      // from the failure output alone.
      XCTFail("""
        Reference (\(refPattern)):
        \(ref)
        Current (\(pattern)):
        \(prog)
        """)
    }
  }
}
30+
931
func testCompileQuantification() throws {
1032

1133
// NOTE: While we might change how we compile
@@ -26,15 +48,22 @@ extension RegexTests {
2648
]
2749

2850
for row in equivalents {
29-
let progs = try row.map {
30-
try _compileRegex($0).engine.program
31-
}
32-
let ref = progs.first!
33-
for prog in progs.dropFirst() {
34-
XCTAssert(ref.instructions.elementsEqual(
35-
prog.instructions))
36-
37-
}
51+
try testCompilationEquivalence(row)
52+
}
53+
}
54+
55+
/// Verifies that the alternate spellings of positive and negative
/// lookahead groups each compile to the same instructions as their
/// `(?=...)` / `(?!...)` form.
func testCompileGroups() throws {
  let positiveLookaheads = [
    "(?= assert)",
    "(*pla: assert)",
    "(*positive_lookahead: assert)",
  ]
  let negativeLookaheads = [
    "(?! assert)",
    "(*nla: assert)",
    "(*negative_lookahead: assert)",
  ]

  // Each row is checked independently, in the same order as listed.
  try [positiveLookaheads, negativeLookaheads].forEach { row in
    try testCompilationEquivalence(row)
  }
}
4069
}

Tests/RegexTests/MatchTests.swift

Lines changed: 49 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,9 @@ extension RegexTests {
438438
"--+", input: "123---xyz", match: "---")
439439
matchTest(
440440
"~~*", input: "123~~~xyz", match: "~~~")
441+
}
441442

443+
func testCharacterProperties() {
442444
// MARK: Character names.
443445

444446
matchTest(#"\N{ASTERISK}"#, input: "123***xyz", match: "*")
@@ -589,59 +591,40 @@ extension RegexTests {
589591
input: "\u{7}\u{1b}\u{a}\n\r\t abc", match: "a")
590592
}
591593

592-
func testMatchGroups() {
593-
// MARK: Groups
594-
595-
// Named captures
596-
matchTest(
597-
#"a(?<label>b)c"#, input: "123abcxyz", match: "abc")
598-
matchTest(
599-
#"a(?'label'b)c"#, input: "123abcxyz", match: "abc")
600-
matchTest(
601-
#"a(?P<label>b)c"#, input: "123abcxyz", match: "abc")
602-
603-
// Other groups
604-
matchTest(
605-
#"a(?:b)c"#, input: "123abcxyz", match: "abc")
606-
matchTest(
607-
"(?|(a)|(b)|(c))", input: "123abcxyz", match: "a")
608-
609-
matchTest(
610-
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc")
611-
matchTest(
612-
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true)
613-
matchTest(
614-
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true)
615-
matchTest(
616-
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac")
617-
matchTest(
618-
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true)
619-
594+
func testAssertions() {
620595
matchTest(
621596
#"\d+(?= dollars)"#,
622-
input: "Price: 100 dollars", match: "100", xfail: true)
597+
input: "Price: 100 dollars", match: "100")
598+
matchTest(
599+
#"\d+(?= pesos)"#,
600+
input: "Price: 100 dollars", match: nil)
623601
matchTest(
624602
#"(?=\d+ dollars)\d+"#,
625-
input: "Price: 100 dollars", match: "100", xfail: true)
603+
input: "Price: 100 dollars", match: "100",
604+
xfail: true) // TODO
605+
626606
matchTest(
627607
#"\d+(*pla: dollars)"#,
628-
input: "Price: 100 dollars", match: "100", xfail: true)
608+
input: "Price: 100 dollars", match: "100")
629609
matchTest(
630610
#"\d+(*positive_lookahead: dollars)"#,
631-
input: "Price: 100 dollars", match: "100", xfail: true)
611+
input: "Price: 100 dollars", match: "100")
632612

633613
matchTest(
634614
#"\d+(?! dollars)"#,
635-
input: "Price: 100 pesos", match: "100", xfail: true)
615+
input: "Price: 100 pesos", match: "100")
616+
matchTest(
617+
#"\d+(?! dollars)"#,
618+
input: "Price: 100 dollars", match: "10")
636619
matchTest(
637620
#"(?!\d+ dollars)\d+"#,
638-
input: "Price: 100 pesos", match: "100", xfail: true)
621+
input: "Price: 100 pesos", match: "100")
639622
matchTest(
640623
#"\d+(*nla: dollars)"#,
641-
input: "Price: 100 pesos", match: "100", xfail: true)
624+
input: "Price: 100 pesos", match: "100")
642625
matchTest(
643626
#"\d+(*negative_lookahead: dollars)"#,
644-
input: "Price: 100 pesos", match: "100", xfail: true)
627+
input: "Price: 100 pesos", match: "100")
645628

646629
matchTest(
647630
#"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true)
@@ -664,6 +647,36 @@ extension RegexTests {
664647
// engines generally enforce that lookbehinds are fixed width
665648
matchTest(
666649
#"\d{3}(?<!USD\d{3})"#, input: "Price: JYP100", match: "100", xfail: true)
650+
}
651+
652+
func testMatchGroups() {
653+
// MARK: Groups
654+
655+
// Named captures
656+
matchTest(
657+
#"a(?<label>b)c"#, input: "123abcxyz", match: "abc")
658+
matchTest(
659+
#"a(?'label'b)c"#, input: "123abcxyz", match: "abc")
660+
matchTest(
661+
#"a(?P<label>b)c"#, input: "123abcxyz", match: "abc")
662+
663+
// Other groups
664+
matchTest(
665+
#"a(?:b)c"#, input: "123abcxyz", match: "abc")
666+
matchTest(
667+
"(?|(a)|(b)|(c))", input: "123abcxyz", match: "a")
668+
669+
matchTest(
670+
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc")
671+
matchTest(
672+
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true)
673+
matchTest(
674+
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true)
675+
matchTest(
676+
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac")
677+
matchTest(
678+
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true)
679+
667680

668681
// TODO: Test example where non-atomic is significant
669682
matchTest(

0 commit comments

Comments
 (0)