Skip to content

Commit df6be0c

Browse files
committed
Parse .NET balanced captures
This requires imposing some restrictions on what can be used as a group name to allow for the syntax `(?<a-b>)`. For now, restrict the characters to letters, numbers and `_`, and forbid the first character from being a number. This should be no stricter than the rules imposed by PCRE, Oniguruma, ICU, Java and .NET.
1 parent 4254dd8 commit df6be0c

File tree

11 files changed

+199
-57
lines changed

11 files changed

+199
-57
lines changed

Sources/_MatchingEngine/Regex/AST/Group.swift

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ extension AST {
3131
// (?<name>...) (?'name'...) (?P<name>...)
3232
case namedCapture(Located<String>)
3333

34+
// (?<name-priorName>) (?'name-priorName')
35+
case balancedCapture(BalancedCapture)
36+
3437
// (?:...)
3538
case nonCapture
3639

@@ -79,7 +82,7 @@ extension AST {
7982
extension AST.Group.Kind {
8083
public var isCapturing: Bool {
8184
switch self {
82-
case .capture, .namedCapture: return true
85+
case .capture, .namedCapture, .balancedCapture: return true
8386
default: return false
8487
}
8588
}
@@ -103,6 +106,7 @@ extension AST.Group.Kind {
103106
public var name: String? {
104107
switch self {
105108
case .namedCapture(let name): return name.value
109+
case .balancedCapture(let b): return b.name?.value
106110
default: return nil
107111
}
108112
}
@@ -121,5 +125,26 @@ extension AST.Group {
121125
default: return nil
122126
}
123127
}
128+
}
129+
130+
extension AST.Group {
131+
public struct BalancedCapture: Hashable {
132+
/// The name of the group, or nil if the group has no name.
133+
public var name: AST.Located<String>?
134+
135+
/// The location of the `-` in the group.
136+
public var dash: SourceLocation
124137

138+
/// The name of the prior group that the balancing group references.
139+
public var priorName: AST.Located<String>
140+
141+
public init(
142+
name: AST.Located<String>?, dash: SourceLocation,
143+
priorName: AST.Located<String>
144+
) {
145+
self.name = name
146+
self.dash = dash
147+
self.priorName = priorName
148+
}
149+
}
125150
}

Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,10 @@ extension AST {
4444
return .atom() + innerCaptures
4545
case .namedCapture(let name):
4646
return .atom(name: name.value) + innerCaptures
47+
case .balancedCapture(let b):
48+
return .atom(name: b.name?.value) + innerCaptures
4749
default:
50+
precondition(!group.kind.value.isCapturing)
4851
return innerCaptures
4952
}
5053
case .conditional(let c):

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ enum ParseError: Error, Hashable {
5555
case emptyProperty
5656

5757
case expectedGroupSpecifier
58+
case expectedGroupName
59+
case groupNameMustBeAlphaNumeric
60+
case groupNameCannotStartWithNumber
5861
case cannotRemoveTextSegmentOptions
5962
}
6063

@@ -113,6 +116,12 @@ extension ParseError: CustomStringConvertible {
113116
return "empty property"
114117
case .expectedGroupSpecifier:
115118
return "expected group specifier"
119+
case .expectedGroupName:
120+
return "expected group name"
121+
case .groupNameMustBeAlphaNumeric:
122+
return "group name must only contain alphanumeric characters"
123+
case .groupNameCannotStartWithNumber:
124+
return "group name must not start with number"
116125
case .cannotRemoveTextSegmentOptions:
117126
return "text segment mode cannot be unset, only changed"
118127
}

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 68 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -659,13 +659,61 @@ extension Source {
659659
}
660660
}
661661

662+
/// Consume a group name: a non-empty run of word characters (letters, numbers, `_`) that does not start with a number.
663+
private mutating func expectGroupName(
664+
endingWith ending: String, eatEnding: Bool = true
665+
) throws -> Located<String> {
666+
let str = try recordLoc { src -> String in
667+
if src.isEmpty || src.tryEat(sequence: ending) {
668+
throw ParseError.expectedGroupName
669+
}
670+
if src.peek()!.isNumber {
671+
throw ParseError.groupNameCannotStartWithNumber
672+
}
673+
guard let str = src.tryEatPrefix(\.isWordCharacter)?.string else {
674+
throw ParseError.groupNameMustBeAlphaNumeric
675+
}
676+
return str
677+
}
678+
if eatEnding {
679+
try expect(sequence: ending)
680+
}
681+
return str
682+
}
683+
684+
/// Consume a named group field, producing either a named capture or balanced
685+
/// capture.
686+
///
687+
/// NamedGroup -> 'P<' GroupNameBody '>'
688+
/// | '<' GroupNameBody '>'
689+
/// | "'" GroupNameBody "'"
690+
/// GroupNameBody -> \w+ | \w* '-' \w+
691+
///
692+
private mutating func expectNamedGroup(
693+
endingWith ending: String
694+
) throws -> AST.Group.Kind {
695+
func lexBalanced(_ lhs: Located<String>? = nil) throws -> AST.Group.Kind? {
696+
// If we have a '-', this is a .NET-style 'balanced group'.
697+
guard let dash = tryEatWithLoc("-") else { return nil }
698+
let rhs = try expectGroupName(endingWith: ending)
699+
return .balancedCapture(.init(name: lhs, dash: dash, priorName: rhs))
700+
}
701+
702+
// Lex a group name, trying to lex a '-rhs' for a balanced capture group
703+
// both before the name (the unnamed form '-rhs') and after it ('lhs-rhs').
704+
if let b = try lexBalanced() { return b }
705+
let name = try expectGroupName(endingWith: ending, eatEnding: false)
706+
if let b = try lexBalanced(name) { return b }
707+
708+
try expect(sequence: ending)
709+
return .namedCapture(name)
710+
}
711+
662712
/// Try to consume the start of a group
663713
///
664714
/// GroupStart -> '(?' GroupKind | '('
665-
/// GroupKind -> Named | ':' | '|' | '>' | '=' | '!' | '*' | '<=' | '<!'
666-
/// | '<*' | MatchingOptionSeq (':' | ')')
667-
/// Named -> '<' [^'>']+ '>' | 'P<' [^'>']+ '>'
668-
/// | '\'' [^'\'']+ '\''
715+
/// GroupKind -> ':' | '|' | '>' | '=' | '!' | '*' | '<=' | '<!' | '<*'
716+
/// | NamedGroup | MatchingOptionSeq (':' | ')')
669717
///
670718
/// If `SyntaxOptions.experimentalGroups` is enabled, also accepts:
671719
///
@@ -709,16 +757,11 @@ extension Source {
709757
if src.tryEat(sequence: "<*") { return .nonAtomicLookbehind }
710758

711759
// Named
712-
// TODO: Group name validation, PCRE (and ICU + Oniguruma as far as I
713-
// can tell), enforce word characters only, with the first character
714-
// being a non-digit.
715760
if src.tryEat("<") || src.tryEat(sequence: "P<") {
716-
let name = try src.expectQuoted(endingWith: ">")
717-
return .namedCapture(name)
761+
return try src.expectNamedGroup(endingWith: ">")
718762
}
719763
if src.tryEat("'") {
720-
let name = try src.expectQuoted(endingWith: "'")
721-
return .namedCapture(name)
764+
return try src.expectNamedGroup(endingWith: "'")
722765
}
723766

724767
// Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
@@ -853,9 +896,9 @@ extension Source {
853896
// FIXME: This should apply to future groups too.
854897
// TODO: We should probably advise users to use the more explicit
855898
// syntax.
856-
let nameRef = try src.expectNamedReference(
857-
endingWith: ")", eatEnding: false)
858-
if context.isPriorGroupRef(nameRef.kind) {
899+
if let nameRef = src.lexNamedReference(endingWith: ")",
900+
eatEnding: false),
901+
context.isPriorGroupRef(nameRef.kind) {
859902
return .groupMatched(nameRef)
860903
}
861904
return nil
@@ -1046,11 +1089,20 @@ extension Source {
10461089
private mutating func expectNamedReference(
10471090
endingWith end: String, eatEnding: Bool = true
10481091
) throws -> AST.Reference {
1049-
// TODO: Group name validation, see comment in lexGroupStart.
1050-
let str = try expectQuoted(endingWith: end, eatEnding: eatEnding)
1092+
let str = try expectGroupName(endingWith: end, eatEnding: eatEnding)
10511093
return .init(.named(str.value), innerLoc: str.location)
10521094
}
10531095

1096+
/// Try to consume a named reference up to a closing delimiter, returning
1097+
/// `nil` if the characters aren't valid for a named reference.
1098+
private mutating func lexNamedReference(
1099+
endingWith end: String, eatEnding: Bool = true
1100+
) -> AST.Reference? {
1101+
tryEating { src in
1102+
try? src.expectNamedReference(endingWith: end, eatEnding: eatEnding)
1103+
}
1104+
}
1105+
10541106
/// Try to lex a numbered reference, or otherwise a named reference.
10551107
///
10561108
/// NameOrNumberRef -> NumberRef | <String>

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -156,19 +156,20 @@ extension AST.Reference: _ASTPrintable {
156156
extension AST.Group.Kind: _ASTPrintable {
157157
public var _dumpBase: String {
158158
switch self {
159-
case .capture: return "capture"
160-
case .namedCapture(let s): return "capture<\(s.value)>"
161-
case .nonCapture: return "nonCapture"
162-
case .nonCaptureReset: return "nonCaptureReset"
163-
case .atomicNonCapturing: return "atomicNonCapturing"
164-
case .lookahead: return "lookahead"
165-
case .negativeLookahead: return "negativeLookahead"
166-
case .nonAtomicLookahead: return "nonAtomicLookahead"
167-
case .lookbehind: return "lookbehind"
168-
case .negativeLookbehind: return "negativeLookbehind"
169-
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
170-
case .scriptRun: return "scriptRun"
171-
case .atomicScriptRun: return "atomicScriptRun"
159+
case .capture: return "capture"
160+
case .namedCapture(let s): return "capture<\(s.value)>"
161+
case .balancedCapture(let b): return "balanced capture \(b)"
162+
case .nonCapture: return "nonCapture"
163+
case .nonCaptureReset: return "nonCaptureReset"
164+
case .atomicNonCapturing: return "atomicNonCapturing"
165+
case .lookahead: return "lookahead"
166+
case .negativeLookahead: return "negativeLookahead"
167+
case .nonAtomicLookahead: return "nonAtomicLookahead"
168+
case .lookbehind: return "lookbehind"
169+
case .negativeLookbehind: return "negativeLookbehind"
170+
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
171+
case .scriptRun: return "scriptRun"
172+
case .atomicScriptRun: return "atomicScriptRun"
172173
case .changeMatchingOptions(let seq, let isIsolated):
173174
return "changeMatchingOptions<\(seq), \(isIsolated)>"
174175
}
@@ -257,3 +258,9 @@ extension AST.Atom.BacktrackingDirective: _ASTPrintable {
257258
return result
258259
}
259260
}
261+
262+
extension AST.Group.BalancedCapture: _ASTPrintable {
263+
public var _dumpBase: String {
264+
"\(name?.value ?? "")-\(priorName.value)"
265+
}
266+
}

Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -138,19 +138,20 @@ extension AST.Quote {
138138
extension AST.Group.Kind {
139139
var _canonicalBase: String {
140140
switch self {
141-
case .capture: return "("
142-
case .namedCapture(let n): return "(?<\(n.value)>"
143-
case .nonCapture: return "(?:"
144-
case .nonCaptureReset: return "(?|"
145-
case .atomicNonCapturing: return "(?>"
146-
case .lookahead: return "(?="
147-
case .negativeLookahead: return "(?!"
148-
case .nonAtomicLookahead: return "(?*"
149-
case .lookbehind: return "(?<="
150-
case .negativeLookbehind: return "(?<!"
151-
case .nonAtomicLookbehind: return "(?<*"
152-
case .scriptRun: return "(*sr:"
153-
case .atomicScriptRun: return "(*asr:"
141+
case .capture: return "("
142+
case .namedCapture(let n): return "(?<\(n.value)>"
143+
case .balancedCapture(let b): return "(?<\(b._canonicalBase)>"
144+
case .nonCapture: return "(?:"
145+
case .nonCaptureReset: return "(?|"
146+
case .atomicNonCapturing: return "(?>"
147+
case .lookahead: return "(?="
148+
case .negativeLookahead: return "(?!"
149+
case .nonAtomicLookahead: return "(?*"
150+
case .lookbehind: return "(?<="
151+
case .negativeLookbehind: return "(?<!"
152+
case .nonAtomicLookbehind: return "(?<*"
153+
case .scriptRun: return "(*sr:"
154+
case .atomicScriptRun: return "(*asr:"
154155

155156
case .changeMatchingOptions:
156157
return "(/* TODO: matchign options in canonical form */"
@@ -220,3 +221,9 @@ extension AST.Reference {
220221
extension AST.CustomCharacterClass.Start {
221222
var _canonicalBase: String { self.rawValue }
222223
}
224+
225+
extension AST.Group.BalancedCapture {
226+
var _canonicalBase: String {
227+
"\(name?.value ?? "")-\(priorName.value)"
228+
}
229+
}

Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,9 @@ extension AST.Group.Kind {
286286
case .namedCapture(let n):
287287
return "name: \"\(n)\""
288288

289+
case .balancedCapture:
290+
return "/* TODO: balanced captures */"
291+
289292
case .nonCapture: return ""
290293

291294
case .nonCaptureReset:

Sources/_MatchingEngine/Utility/MissingUnicode.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -657,5 +657,7 @@ public enum OnigurumaSpecialProperty: String, Hashable {
657657
}
658658

659659
extension Character {
660-
var isOctalDigit: Bool { ("0"..."7").contains(self) }
660+
public var isOctalDigit: Bool { ("0"..."7").contains(self) }
661+
662+
public var isWordCharacter: Bool { isLetter || isNumber || self == "_" }
661663
}

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ func namedCapture(
6868
) -> AST {
6969
group(.namedCapture(.init(faking: name)), child)
7070
}
71+
func balancedCapture(name: String?, priorName: String, _ child: AST) -> AST {
72+
group(.balancedCapture(
73+
.init(name: name.map { .init(faking: $0) }, dash: .fake,
74+
priorName: .init(faking: priorName))
75+
), child)
76+
}
7177
func nonCaptureReset(
7278
_ child: AST
7379
) -> AST {

Sources/_StringProcessing/CharacterClass.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ public struct CharacterClass: Hashable {
148148
case .newlineSequence: matched = c.isNewline
149149
case .verticalWhitespace: fatalError("Not implemented")
150150
case .whitespace: matched = c.isWhitespace
151-
case .word: matched = c.isLetter || c.isNumber || c == "_"
151+
case .word: matched = c.isWordCharacter
152152
case .custom(let set): matched = set.any { $0.matches(c) }
153153
}
154154
if isInverted {

0 commit comments

Comments
 (0)