Skip to content

Commit fbdc959

Browse files
authored
Merge pull request #129 from hamishknight/oniguruma-calling
2 parents 2098c07 + f6a0240 commit fbdc959

16 files changed

+830
-100
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ public indirect enum AST:
3838

3939
case customCharacterClass(CustomCharacterClass)
4040

41+
case absentFunction(AbsentFunction)
42+
4143
case empty(Empty)
4244

4345
// FIXME: Move off the regex literal AST
@@ -55,16 +57,17 @@ extension AST {
5557
// over `self` _everywhere_ we want to do anything.
5658
var _associatedValue: _ASTNode {
5759
switch self {
58-
case let .alternation(v): return v
59-
case let .concatenation(v): return v
60-
case let .group(v): return v
61-
case let .conditional(v): return v
62-
case let .quantification(v): return v
63-
case let .quote(v): return v
64-
case let .trivia(v): return v
65-
case let .atom(v): return v
66-
case let .customCharacterClass(v): return v
67-
case let .empty(v): return v
60+
case let .alternation(v): return v
61+
case let .concatenation(v): return v
62+
case let .group(v): return v
63+
case let .conditional(v): return v
64+
case let .quantification(v): return v
65+
case let .quote(v): return v
66+
case let .trivia(v): return v
67+
case let .atom(v): return v
68+
case let .customCharacterClass(v): return v
69+
case let .empty(v): return v
70+
case let .absentFunction(v): return v
6871

6972
case let .groupTransform(g, _):
7073
return g // FIXME: get this out of here
@@ -110,7 +113,7 @@ extension AST {
110113
switch self {
111114
case .atom(let a):
112115
return a.isQuantifiable
113-
case .group, .conditional, .customCharacterClass:
116+
case .group, .conditional, .customCharacterClass, .absentFunction:
114117
return true
115118
case .alternation, .concatenation, .quantification, .quote, .trivia,
116119
.empty, .groupTransform:
@@ -185,6 +188,50 @@ extension AST {
185188
}
186189
}
187190

191+
/// An Oniguruma absent function. This is used to model a pattern which should
192+
/// not be matched against across varying scopes.
///
/// The concrete form (repeater, expression, stopper or clearer) and any child
/// patterns are carried by `kind`.
193+
public struct AbsentFunction: Hashable, _ASTNode {
194+
/// The two spellings that may open an absent function in the source.
public enum Start: Hashable {
195+
/// `(?~|`
196+
case withPipe
197+
198+
/// `(?~`
199+
case withoutPipe
200+
}
201+
/// The form an absent function takes, together with its child patterns.
public enum Kind: Hashable {
202+
/// An absent repeater `(?~absent)`. This is equivalent to `(?~|absent|.*)`
203+
/// and therefore matches as long as the pattern `absent` is not matched.
204+
case repeater(AST)
205+
206+
/// An absent expression `(?~|absent|expr)`, which defines an `absent`
207+
/// pattern which must not be matched against while the pattern `expr` is
208+
/// matched.
///
/// `pipe` is presumably the location of the `|` separating `absent` from
/// `expr` -- TODO confirm against the parser.
209+
case expression(absentee: AST, pipe: SourceLocation, expr: AST)
210+
211+
/// An absent stopper `(?~|absent)`, which prevents matching against
212+
/// `absent` until the end of the regex, or until it is cleared.
213+
case stopper(AST)
214+
215+
/// An absent clearer `(?~|)` which cancels the effect of an absent
216+
/// stopper.
217+
case clearer
218+
}
219+
/// The location of `(?~` or `(?~|`
220+
public var start: SourceLocation
221+
222+
/// The form of this absent function, including any child patterns.
public var kind: Kind
223+
224+
// NOTE(review): presumably the source range of the entire absent function
// (as opposed to `start`, which only covers the opening) -- confirm.
public var location: SourceLocation
225+
226+
/// Creates an absent function, storing each argument in the property of the
/// same name.
public init(
227+
_ kind: Kind, start: SourceLocation, location: SourceLocation
228+
) {
229+
self.kind = kind
230+
self.start = start
231+
self.location = location
232+
}
233+
}
234+
188235
public struct Reference: Hashable {
189236
@frozen
190237
public enum Kind: Hashable {

Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,12 @@ extension AST.Group: _ASTParent {
4040
extension AST.Quantification: _ASTParent {
4141
var children: [AST] { [child] }
4242
}
43+
// Exposes the child patterns of an absent function through `_ASTParent`.
extension AST.AbsentFunction: _ASTParent {
44+
/// The child nodes of this absent function, in source order: the single
/// pattern of a repeater or stopper, the absentee followed by the expression
/// for an absent expression, and no children for a clearer.
var children: [AST] {
45+
switch kind {
46+
case .repeater(let a), .stopper(let a): return [a]
47+
// The pipe location is not a child node, so it is skipped.
case .expression(let a, _, let c): return [a, c]
48+
case .clearer: return []
49+
}
50+
}
51+
}

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 111 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -476,14 +476,117 @@ extension AST.Atom {
476476
}
477477

478478
extension AST.Atom {
479-
public struct Callout: Hashable {
480-
public enum Argument: Hashable {
481-
case number(Int)
482-
case string(String)
479+
public enum Callout: Hashable {
480+
/// A PCRE callout written `(?C...)`
481+
public struct PCRE: Hashable {
482+
public enum Argument: Hashable {
483+
case number(Int)
484+
case string(String)
485+
}
486+
public var arg: AST.Located<Argument>
487+
488+
public init(_ arg: AST.Located<Argument>) {
489+
self.arg = arg
490+
}
491+
492+
/// Whether the argument isn't written explicitly in the source, e.g.
493+
/// `(?C)` which is implicitly `(?C0)`.
494+
public var isImplicit: Bool { arg.location.isEmpty }
483495
}
484-
public var arg: AST.Located<Argument>
485-
public init(_ arg: AST.Located<Argument>) {
486-
self.arg = arg
496+
497+
/// A named Oniguruma callout written `(*name[tag]{args, ...})`
498+
public struct OnigurumaNamed: Hashable {
499+
public struct ArgList: Hashable {
500+
public var leftBrace: SourceLocation
501+
public var args: [AST.Located<String>]
502+
public var rightBrace: SourceLocation
503+
504+
public init(
505+
_ leftBrace: SourceLocation,
506+
_ args: [AST.Located<String>],
507+
_ rightBrace: SourceLocation
508+
) {
509+
self.leftBrace = leftBrace
510+
self.args = args
511+
self.rightBrace = rightBrace
512+
}
513+
}
514+
515+
public var name: AST.Located<String>
516+
public var tag: OnigurumaTag?
517+
public var args: ArgList?
518+
519+
public init(
520+
_ name: AST.Located<String>, tag: OnigurumaTag?, args: ArgList?
521+
) {
522+
self.name = name
523+
self.tag = tag
524+
self.args = args
525+
}
526+
}
527+
528+
/// An Oniguruma callout 'of contents', written `(?{...}[tag]D)`
529+
public struct OnigurumaOfContents: Hashable {
530+
public enum Direction: Hashable {
531+
case inProgress // > (the default)
532+
case inRetraction // <
533+
case both // X
534+
}
535+
public var openBraces: SourceLocation
536+
public var contents: AST.Located<String>
537+
public var closeBraces: SourceLocation
538+
public var tag: OnigurumaTag?
539+
public var direction: AST.Located<Direction>
540+
541+
public init(
542+
_ openBraces: SourceLocation, _ contents: AST.Located<String>,
543+
_ closeBraces: SourceLocation, tag: OnigurumaTag?,
544+
direction: AST.Located<Direction>
545+
) {
546+
self.openBraces = openBraces
547+
self.contents = contents
548+
self.closeBraces = closeBraces
549+
self.tag = tag
550+
self.direction = direction
551+
}
552+
553+
/// Whether the direction flag isn't written explicitly in the
554+
/// source, e.g. `(?{x})` which is implicitly `(?{x}>)`.
555+
public var isDirectionImplicit: Bool { direction.location.isEmpty }
556+
}
557+
case pcre(PCRE)
558+
case onigurumaNamed(OnigurumaNamed)
559+
case onigurumaOfContents(OnigurumaOfContents)
560+
561+
private var _associatedValue: Any {
562+
switch self {
563+
case .pcre(let v): return v
564+
case .onigurumaNamed(let v): return v
565+
case .onigurumaOfContents(let v): return v
566+
}
567+
}
568+
569+
func `as`<T>(_ t: T.Type = T.self) -> T? {
570+
_associatedValue as? T
571+
}
572+
}
573+
}
574+
575+
extension AST.Atom.Callout {
576+
/// A tag specifier `[...]` which may appear in an Oniguruma callout.
577+
public struct OnigurumaTag: Hashable {
578+
// NOTE(review): presumably the location of the opening `[` -- confirm.
public var leftBracket: SourceLocation
579+
/// The name written inside the tag specifier.
public var name: AST.Located<String>
580+
// NOTE(review): presumably the location of the closing `]` -- confirm.
public var rightBracket: SourceLocation
581+
582+
/// Creates a tag specifier, storing each argument in the property of the
/// same name.
public init(
583+
_ leftBracket: SourceLocation,
584+
_ name: AST.Located<String>,
585+
_ rightBracket: SourceLocation
586+
) {
587+
self.leftBracket = leftBracket
588+
self.name = name
589+
self.rightBracket = rightBracket
487590
}
488591
}
489592
}
@@ -594,7 +697,7 @@ extension AST {
594697
case .alternation, .concatenation, .group,
595698
.conditional, .quantification, .quote,
596699
.trivia, .customCharacterClass, .empty,
597-
.groupTransform:
700+
.groupTransform, .absentFunction:
598701
return nil
599702
}
600703
}

Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ extension AST {
8181
quantification.amount.value == .zeroOrOne
8282
? CaptureStructure.optional
8383
: CaptureStructure.array)
84+
case .absentFunction(let abs):
85+
// Only the child of an expression absent function is relevant, as the
86+
// other expressions don't actually get matched against.
87+
switch abs.kind {
88+
case .expression(_, _, let child):
89+
return child.captureStructure
90+
case .clearer, .repeater, .stopper:
91+
return .empty
92+
}
8493
case .quote, .trivia, .atom, .customCharacterClass, .empty:
8594
return .empty
8695
}

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ enum ParseError: Error, Hashable {
3131
case tooManyBranchesInConditional(Int)
3232
case unsupportedCondition(String)
3333

34+
case tooManyAbsentExpressionChildren(Int)
35+
3436
case expectedASCII(Character)
3537

3638
case expectedNonEmptyContents
@@ -55,10 +57,25 @@ enum ParseError: Error, Hashable {
5557
case emptyProperty
5658

5759
case expectedGroupSpecifier
58-
case expectedGroupName
59-
case groupNameMustBeAlphaNumeric
60-
case groupNameCannotStartWithNumber
60+
case unbalancedEndOfGroup
61+
62+
// Identifier diagnostics.
63+
case expectedIdentifier(IdentifierKind)
64+
case identifierMustBeAlphaNumeric(IdentifierKind)
65+
case identifierCannotStartWithNumber(IdentifierKind)
66+
6167
case cannotRemoveTextSegmentOptions
68+
case expectedCalloutArgument
69+
}
70+
71+
extension IdentifierKind {
72+
/// A short noun phrase naming this kind of identifier, for use in
/// diagnostic messages (it is interpolated into the identifier-related
/// `ParseError` descriptions).
fileprivate var diagDescription: String {
73+
switch self {
74+
case .groupName: return "group name"
75+
case .onigurumaCalloutName: return "callout name"
76+
case .onigurumaCalloutTag: return "callout tag"
77+
}
78+
}
6279
}
6380

6481
extension ParseError: CustomStringConvertible {
@@ -96,6 +113,8 @@ extension ParseError: CustomStringConvertible {
96113
return "expected 2 branches in conditional, have \(i)"
97114
case let .unsupportedCondition(str):
98115
return "\(str) cannot be used as condition"
116+
case let .tooManyAbsentExpressionChildren(i):
117+
return "expected 2 expressions in absent expression, have \(i)"
99118
case let .unknownGroupKind(str):
100119
return "unknown group kind '(\(str)'"
101120
case let .unknownCalloutKind(str):
@@ -116,14 +135,18 @@ extension ParseError: CustomStringConvertible {
116135
return "empty property"
117136
case .expectedGroupSpecifier:
118137
return "expected group specifier"
119-
case .expectedGroupName:
120-
return "expected group name"
121-
case .groupNameMustBeAlphaNumeric:
122-
return "group name must only contain alphanumeric characters"
123-
case .groupNameCannotStartWithNumber:
124-
return "group name must not start with number"
138+
case .unbalancedEndOfGroup:
139+
return "closing ')' does not balance any groups openings"
140+
case .expectedIdentifier(let i):
141+
return "expected \(i.diagDescription)"
142+
case .identifierMustBeAlphaNumeric(let i):
143+
return "\(i.diagDescription) must only contain alphanumeric characters"
144+
case .identifierCannotStartWithNumber(let i):
145+
return "\(i.diagDescription) must not start with number"
125146
case .cannotRemoveTextSegmentOptions:
126147
return "text segment mode cannot be unset, only changed"
148+
case .expectedCalloutArgument:
149+
return "expected argument to callout"
127150
}
128151
}
129152
}

0 commit comments

Comments
 (0)