Skip to content

Commit fdd98c6

Browse files
committed
Parse Oniguruma absent function
Parse the 4 varieties of absent function syntax supported by Oniguruma.
1 parent 23403a9 commit fdd98c6

File tree

15 files changed

+265
-16
lines changed

15 files changed

+265
-16
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 51 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ public indirect enum AST:
3838

3939
case customCharacterClass(CustomCharacterClass)
4040

41+
case absentFunction(AbsentFunction)
42+
4143
case empty(Empty)
4244

4345
// FIXME: Move off the regex literal AST
@@ -55,16 +57,17 @@ extension AST {
5557
// over `self` _everywhere_ we want to do anything.
5658
var _associatedValue: _ASTNode {
5759
switch self {
58-
case let .alternation(v): return v
59-
case let .concatenation(v): return v
60-
case let .group(v): return v
61-
case let .conditional(v): return v
62-
case let .quantification(v): return v
63-
case let .quote(v): return v
64-
case let .trivia(v): return v
65-
case let .atom(v): return v
66-
case let .customCharacterClass(v): return v
67-
case let .empty(v): return v
60+
case let .alternation(v): return v
61+
case let .concatenation(v): return v
62+
case let .group(v): return v
63+
case let .conditional(v): return v
64+
case let .quantification(v): return v
65+
case let .quote(v): return v
66+
case let .trivia(v): return v
67+
case let .atom(v): return v
68+
case let .customCharacterClass(v): return v
69+
case let .empty(v): return v
70+
case let .absentFunction(v): return v
6871

6972
case let .groupTransform(g, _):
7073
return g // FIXME: get this out of here
@@ -110,7 +113,7 @@ extension AST {
110113
switch self {
111114
case .atom(let a):
112115
return a.isQuantifiable
113-
case .group, .conditional, .customCharacterClass:
116+
case .group, .conditional, .customCharacterClass, .absentFunction:
114117
return true
115118
case .alternation, .concatenation, .quantification, .quote, .trivia,
116119
.empty, .groupTransform:
@@ -185,6 +188,43 @@ extension AST {
185188
}
186189
}
187190

191+
public struct AbsentFunction: Hashable, _ASTNode {
192+
public enum Start: Hashable {
193+
/// (?~|
194+
case withPipe
195+
196+
/// (?~
197+
case withoutPipe
198+
}
199+
public enum Kind: Hashable {
200+
/// `(?~absent)`
201+
case repeater(AST)
202+
203+
/// `(?~|absent|expr)`
204+
case expression(absentee: AST, pipe: SourceLocation, expr: AST)
205+
206+
/// `(?~|absent)`
207+
case stopper(AST)
208+
209+
/// `(?~|)`
210+
case clearer
211+
}
212+
/// The location of `(?~` or `(?~|`
213+
public var start: SourceLocation
214+
215+
public var kind: Kind
216+
217+
public var location: SourceLocation
218+
219+
public init(
220+
_ kind: Kind, start: SourceLocation, location: SourceLocation
221+
) {
222+
self.kind = kind
223+
self.start = start
224+
self.location = location
225+
}
226+
}
227+
188228
public struct Reference: Hashable {
189229
@frozen
190230
public enum Kind: Hashable {

Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,12 @@ extension AST.Group: _ASTParent {
4040
extension AST.Quantification: _ASTParent {
4141
var children: [AST] { [child] }
4242
}
43+
extension AST.AbsentFunction: _ASTParent {
44+
var children: [AST] {
45+
switch kind {
46+
case .repeater(let a), .stopper(let a): return [a]
47+
case .expression(let a, _, let c): return [a, c]
48+
case .clearer: return []
49+
}
50+
}
51+
}

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,7 @@ extension AST {
697697
case .alternation, .concatenation, .group,
698698
.conditional, .quantification, .quote,
699699
.trivia, .customCharacterClass, .empty,
700-
.groupTransform:
700+
.groupTransform, .absentFunction:
701701
return nil
702702
}
703703
}

Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ extension AST {
8181
quantification.amount.value == .zeroOrOne
8282
? CaptureStructure.optional
8383
: CaptureStructure.array)
84+
case .absentFunction(let abs):
85+
// Only the child of an expression absent function is relevant, as the
86+
// other expressions don't actually get matched against.
87+
switch abs.kind {
88+
case .expression(_, _, let child):
89+
return child.captureStructure
90+
case .clearer, .repeater, .stopper:
91+
return .empty
92+
}
8493
case .quote, .trivia, .atom, .customCharacterClass, .empty:
8594
return .empty
8695
}

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ enum ParseError: Error, Hashable {
3131
case tooManyBranchesInConditional(Int)
3232
case unsupportedCondition(String)
3333

34+
case tooManyAbsentExpressionChildren(Int)
35+
3436
case expectedASCII(Character)
3537

3638
case expectedNonEmptyContents
@@ -111,6 +113,8 @@ extension ParseError: CustomStringConvertible {
111113
return "expected 2 branches in conditional, have \(i)"
112114
case let .unsupportedCondition(str):
113115
return "\(str) cannot be used as condition"
116+
case let .tooManyAbsentExpressionChildren(i):
117+
return "expected 2 expressions in absent expression, have \(i)"
114118
case let .unknownGroupKind(str):
115119
return "unknown group kind '(\(str)'"
116120
case let .unknownCalloutKind(str):

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -967,6 +967,19 @@ extension Source {
967967
}
968968
}
969969

970+
/// Try to consume the start of an absent function.
971+
///
972+
/// AbsentFunctionStart -> '(?~' '|'?
973+
///
974+
mutating func lexAbsentFunctionStart(
975+
) -> Located<AST.AbsentFunction.Start>? {
976+
recordLoc { src in
977+
if src.tryEat(sequence: "(?~|") { return .withPipe }
978+
if src.tryEat(sequence: "(?~") { return .withoutPipe }
979+
return nil
980+
}
981+
}
982+
970983
mutating func lexCustomCCStart(
971984
) throws -> Located<CustomCC.Start>? {
972985
recordLoc { src in

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,11 +262,58 @@ extension Parser {
262262
return .init(kind, child, loc(start))
263263
}
264264

265+
/// Consume the body of an absent function.
266+
///
267+
/// AbsentFunction -> '(?~' RecursiveRegex ')'
268+
/// | '(?~|' Concatenation '|' Concatenation ')'
269+
/// | '(?~|' Concatenation ')'
270+
/// | '(?~|)'
271+
///
272+
mutating func parseAbsentFunctionBody(
273+
_ start: AST.Located<AST.AbsentFunction.Start>
274+
) throws -> AST.AbsentFunction {
275+
let startLoc = start.location
276+
277+
// TODO: Diagnose on nested absent functions, which Oniguruma states is
278+
// undefined behavior.
279+
let kind: AST.AbsentFunction.Kind
280+
switch start.value {
281+
case .withoutPipe:
282+
// Must be a repeater.
283+
kind = .repeater(try parseNode())
284+
case .withPipe where source.peek() == ")":
285+
kind = .clearer
286+
case .withPipe:
287+
// Can either be an expression or stopper depending on whether we have a
288+
// any additional '|'s.
289+
let child = try parseNode()
290+
switch child {
291+
case .alternation(let alt):
292+
// A pipe, so an expression.
293+
let numChildren = alt.children.count
294+
guard numChildren == 2 else {
295+
throw Source.LocatedError(
296+
ParseError.tooManyAbsentExpressionChildren(numChildren),
297+
child.location
298+
)
299+
}
300+
kind = .expression(
301+
absentee: alt.children[0], pipe: alt.pipes[0], expr: alt.children[1])
302+
default:
303+
// No pipes, so a stopper.
304+
kind = .stopper(child)
305+
}
306+
}
307+
try source.expect(")")
308+
return .init(kind, start: startLoc, location: loc(startLoc.start))
309+
}
310+
265311
/// Parse a (potentially quantified) component
266312
///
267-
/// QuantOperand -> Conditional | Group | CustomCharClass | Atom
268-
/// Group -> GroupStart Regex ')'
269-
/// Conditional -> ConditionalStart Concatenation ('|' Concatenation)? ')'
313+
/// QuantOperand -> Conditional | Group | CustomCharClass | Atom
314+
/// | AbsentFunction
315+
/// Group -> GroupStart RecursiveRegex ')'
316+
/// Conditional -> ConditionalStart Concatenation ('|' Concatenation)? ')'
270317
/// ConditionalStart -> KnownConditionalStart | GroupConditionalStart
271318
///
272319
mutating func parseQuantifierOperand() throws -> AST? {
@@ -286,6 +333,11 @@ extension Parser {
286333
start: _start, .init(.group(group), group.location))
287334
}
288335

336+
// Check if we have an Oniguruma absent function.
337+
if let start = source.lexAbsentFunctionStart() {
338+
return .absentFunction(try parseAbsentFunctionBody(start))
339+
}
340+
289341
// Check if we have the start of a group '('.
290342
if let kind = try source.lexGroupStart() {
291343
return .group(try parseGroupBody(start: _start, kind))

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,24 @@ extension AST.Group.BalancedCapture: _ASTPrintable {
314314
"\(name?.value ?? "")-\(priorName.value)"
315315
}
316316
}
317+
318+
extension AST.AbsentFunction.Kind {
319+
public var _dumpBase: String {
320+
switch self {
321+
case .repeater:
322+
return "repeater"
323+
case .expression:
324+
return "expression"
325+
case .stopper:
326+
return "stopper"
327+
case .clearer:
328+
return "clearer"
329+
}
330+
}
331+
}
332+
333+
extension AST.AbsentFunction {
334+
public var _dumpBase: String {
335+
"absent function \(kind._dumpBase)"
336+
}
337+
}

Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ extension PrettyPrinter {
8787
case let .customCharacterClass(ccc):
8888
outputAsCanonical(ccc)
8989

90+
case let .absentFunction(abs):
91+
outputAsCanonical(abs)
92+
9093
case .empty:
9194
output("")
9295

@@ -126,6 +129,25 @@ extension PrettyPrinter {
126129
mutating func outputAsCanonical(_ condition: AST.Conditional.Condition) {
127130
output("(/*TODO: conditional \(condition) */)")
128131
}
132+
133+
mutating func outputAsCanonical(_ abs: AST.AbsentFunction) {
134+
output("(?~")
135+
switch abs.kind {
136+
case .repeater(let a):
137+
outputAsCanonical(a)
138+
case .expression(let a, _, let child):
139+
output("|")
140+
outputAsCanonical(a)
141+
output("|")
142+
outputAsCanonical(child)
143+
case .stopper(let a):
144+
output("|")
145+
outputAsCanonical(a)
146+
case .clearer:
147+
output("|")
148+
}
149+
output(")")
150+
}
129151
}
130152

131153
extension AST.Quote {

Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ extension PrettyPrinter {
128128
case let .customCharacterClass(ccc):
129129
printAsPattern(ccc)
130130

131+
case let .absentFunction(abs):
132+
print("/*TODO: absent function \(abs)*/")
133+
131134
case .empty: print("")
132135
case .groupTransform:
133136
print("// FIXME: get group transform out of here!")

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,23 @@ func pcreCallout(_ arg: AST.Atom.Callout.PCRE.Argument) -> AST {
179179
atom(.callout(.pcre(.init(.init(faking: arg)))))
180180
}
181181

182+
func absentRepeater(_ child: AST) -> AST {
183+
.absentFunction(.init(.repeater(child), start: .fake, location: .fake))
184+
}
185+
func absentExpression(_ absentee: AST, _ child: AST) -> AST {
186+
.absentFunction(.init(
187+
.expression(absentee: absentee, pipe: .fake, expr: child),
188+
start: .fake, location: .fake
189+
))
190+
}
191+
func absentStopper(_ absentee: AST) -> AST {
192+
.absentFunction(.init(.stopper(absentee), start: .fake, location: .fake))
193+
194+
}
195+
func absentRangeClear() -> AST {
196+
.absentFunction(.init(.clearer, start: .fake, location: .fake))
197+
}
198+
182199
func onigurumaNamedCallout(
183200
_ name: String, tag: String? = nil, args: String...
184201
) -> AST {

Sources/_StringProcessing/Compiler.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ class Compiler {
9393
case .trivia, .empty:
9494
break
9595

96+
case .absentFunction:
97+
throw unsupported(node.renderAsCanonical())
98+
9699
case .group(let g):
97100
if let lookaround = g.lookaroundKind {
98101
try emitLookaround(lookaround, g.child)

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ extension AST {
5252
return try ccc.generateConsumer(opts)
5353
case .alternation, .conditional, .concatenation, .group,
5454
.quantification, .quote, .trivia, .empty,
55-
.groupTransform: return nil
55+
.groupTransform, .absentFunction: return nil
5656
}
5757
}
5858
}

Sources/_StringProcessing/Legacy/LegacyCompile.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,9 @@ func compile(
260260
case .conditional:
261261
throw unsupported(ast.renderAsCanonical())
262262

263+
case .absentFunction:
264+
throw unsupported(ast.renderAsCanonical())
265+
263266
case .customCharacterClass:
264267
fatalError("unreachable")
265268

0 commit comments

Comments
 (0)