Skip to content

Commit 32b6685

Browse files
authored
Parse matching options (#91)
* Parse matching options Parse both explicitly scoped `(?i:...)`, `(?^i:...)`, `(?i-m:...)` and implicitly scoped `(?i)`, `(?^i)`, `(?i-m)` matching option specifiers, with support for PCRE, ICU, and Oniguruma options.
1 parent 0a7d4bb commit 32b6685

File tree

8 files changed

+434
-17
lines changed

8 files changed

+434
-17
lines changed

Sources/_MatchingEngine/Regex/AST/Group.swift

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ extension AST {
5353
// (*asr:...)
5454
case atomicScriptRun
5555

56+
// (?iJmnsUxxxwDPSWy{..}-iJmnsUxxxwDPSW:)
57+
// If hasImplicitScope is true, it was written as e.g. (?i), and implicitly
58+
// forms a group containing all the following elements of the current
59+
// group.
60+
case changeMatchingOptions(MatchingOptionSequence, hasImplicitScope: Bool)
61+
5662
// NOTE: Comments appear to be groups, but are not parsed
5763
// the same. They parse more like quotes, so are not
5864
// listed here.
@@ -68,21 +74,38 @@ extension AST.Group.Kind: _ASTPrintable {
6874
}
6975
}
7076

77+
/// Whether this is a group with an implicit scope, e.g. matching options
78+
/// written as (?i) implicitly become parent groups for the rest of the
79+
/// elements in the current group:
80+
///
81+
/// (a(?i)bc)de -> (a(?i:bc))de
82+
///
83+
public var hasImplicitScope: Bool {
84+
switch self {
85+
case .changeMatchingOptions(_, let hasImplicitScope):
86+
return hasImplicitScope
87+
default:
88+
return false
89+
}
90+
}
91+
7192
public var _dumpBase: String {
7293
switch self {
73-
case .capture: return "capture"
74-
case .namedCapture(let s): return "capture<\(s.value)>"
75-
case .nonCapture: return "nonCapture"
76-
case .nonCaptureReset: return "nonCaptureReset"
77-
case .atomicNonCapturing: return "atomicNonCapturing"
78-
case .lookahead: return "lookahead"
79-
case .negativeLookahead: return "negativeLookahead"
80-
case .nonAtomicLookahead: return "nonAtomicLookahead"
81-
case .lookbehind: return "lookbehind"
82-
case .negativeLookbehind: return "negativeLookbehind"
83-
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
84-
case .scriptRun: return "scriptRun"
85-
case .atomicScriptRun: return "atomicScriptRun"
94+
case .capture: return "capture"
95+
case .namedCapture(let s): return "capture<\(s.value)>"
96+
case .nonCapture: return "nonCapture"
97+
case .nonCaptureReset: return "nonCaptureReset"
98+
case .atomicNonCapturing: return "atomicNonCapturing"
99+
case .lookahead: return "lookahead"
100+
case .negativeLookahead: return "negativeLookahead"
101+
case .nonAtomicLookahead: return "nonAtomicLookahead"
102+
case .lookbehind: return "lookbehind"
103+
case .negativeLookbehind: return "negativeLookbehind"
104+
case .nonAtomicLookbehind: return "nonAtomicLookbehind"
105+
case .scriptRun: return "scriptRun"
106+
case .atomicScriptRun: return "atomicScriptRun"
107+
case .changeMatchingOptions(let seq, let hasImplicitScope):
108+
return "changeMatchingOptions<\(seq), \(hasImplicitScope)>"
86109
}
87110
}
88111
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
extension AST {
2+
/// An option written in source that changes matching semantics.
3+
public struct MatchingOption: Hashable {
4+
public enum Kind {
5+
// PCRE options
6+
case caseInsensitive // i
7+
case allowDuplicateGroupNames // J
8+
case multiline // m
9+
case noAutoCapture // n
10+
case singleLine // s
11+
case reluctantByDefault // U
12+
case extended // x
13+
case extraExtended // xx
14+
15+
// ICU options
16+
case unicodeWordBoundaries // w
17+
18+
// Oniguruma options
19+
case asciiOnlyDigit // D
20+
case asciiOnlyPOSIXProps // P
21+
case asciiOnlySpace // S
22+
case asciiOnlyWord // W
23+
24+
// Oniguruma text segment options (these are mutually exclusive and cannot
25+
// be unset, only flipped between)
26+
case textSegmentGraphemeMode // y{g}
27+
case textSegmentWordMode // y{w}
28+
}
29+
public var kind: Kind
30+
public var location: SourceLocation
31+
32+
public init(_ kind: Kind, location: SourceLocation) {
33+
self.kind = kind
34+
self.location = location
35+
}
36+
37+
public var isTextSegmentMode: Bool {
38+
switch kind {
39+
case .textSegmentGraphemeMode, .textSegmentWordMode:
40+
return true
41+
default:
42+
return false
43+
}
44+
}
45+
}
46+
47+
/// A sequence of matching options written in source.
48+
public struct MatchingOptionSequence: Hashable {
49+
/// If the sequence starts with a caret '^', its source location, or nil
50+
/// otherwise. If this is set, it indicates that all the matching options
51+
/// are unset, except the ones in `adding`.
52+
public var caretLoc: SourceLocation?
53+
54+
/// The options to add.
55+
public var adding: [MatchingOption]
56+
57+
/// The location of the '-' between the options to add and options to
58+
/// remove.
59+
public var minusLoc: SourceLocation?
60+
61+
/// The options to remove.
62+
public var removing: [MatchingOption]
63+
64+
public init(caretLoc: SourceLocation?, adding: [MatchingOption],
65+
minusLoc: SourceLocation?, removing: [MatchingOption]) {
66+
self.caretLoc = caretLoc
67+
self.adding = adding
68+
self.minusLoc = minusLoc
69+
self.removing = removing
70+
}
71+
}
72+
}
73+
74+
extension AST.MatchingOption: _ASTPrintable {
75+
public var _dumpBase: String { "\(kind)" }
76+
}
77+
78+
extension AST.MatchingOptionSequence: _ASTPrintable {
79+
public var _dumpBase: String {
80+
"adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)"
81+
}
82+
}

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ enum ParseError: Error, Hashable {
2424

2525
case invalidPOSIXSetName(String)
2626
case emptyProperty
27+
28+
case expectedGroupSpecifier
29+
case cannotRemoveTextSegmentOptions
2730
}
2831

2932
extension ParseError: CustomStringConvertible {
@@ -57,6 +60,10 @@ extension ParseError: CustomStringConvertible {
5760
return "invalid character set name: '\(n)'"
5861
case .emptyProperty:
5962
return "empty property"
63+
case .expectedGroupSpecifier:
64+
return "expected group specifier"
65+
case .cannotRemoveTextSegmentOptions:
66+
return "text segment mode cannot be unset, only changed"
6067
}
6168
}
6269
}

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 117 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -456,11 +456,108 @@ extension Source {
456456
return AST.Trivia(trivia)
457457
}
458458

459+
/// Try to lex a matching option.
460+
///
461+
/// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w'
462+
/// | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}'
463+
///
464+
mutating func lexMatchingOption() throws -> AST.MatchingOption? {
465+
typealias OptKind = AST.MatchingOption.Kind
466+
467+
let locOpt = try recordLoc { src -> OptKind? in
468+
func advanceAndReturn(_ o: OptKind) -> OptKind {
469+
src.advance()
470+
return o
471+
}
472+
guard let c = src.peek() else { return nil }
473+
switch c {
474+
// PCRE options.
475+
case "i": return advanceAndReturn(.caseInsensitive)
476+
case "J": return advanceAndReturn(.allowDuplicateGroupNames)
477+
case "m": return advanceAndReturn(.multiline)
478+
case "n": return advanceAndReturn(.noAutoCapture)
479+
case "s": return advanceAndReturn(.singleLine)
480+
case "U": return advanceAndReturn(.reluctantByDefault)
481+
case "x":
482+
src.advance()
483+
return src.tryEat("x") ? .extraExtended : .extended
484+
485+
// ICU options.
486+
case "w": return advanceAndReturn(.unicodeWordBoundaries)
487+
488+
// Oniguruma options.
489+
case "D": return advanceAndReturn(.asciiOnlyDigit)
490+
case "P": return advanceAndReturn(.asciiOnlyPOSIXProps)
491+
case "S": return advanceAndReturn(.asciiOnlySpace)
492+
case "W": return advanceAndReturn(.asciiOnlyWord)
493+
case "y":
494+
src.advance()
495+
try src.expect("{")
496+
let opt: OptKind
497+
if src.tryEat("w") {
498+
opt = .textSegmentWordMode
499+
} else {
500+
try src.expect("g")
501+
opt = .textSegmentGraphemeMode
502+
}
503+
try src.expect("}")
504+
return opt
505+
506+
default:
507+
return nil
508+
}
509+
}
510+
guard let locOpt = locOpt else { return nil }
511+
return .init(locOpt.value, location: locOpt.location)
512+
}
513+
514+
/// Try to lex a sequence of matching options.
515+
///
516+
/// MatchingOptionSeq -> '^' MatchingOption* | MatchingOption+
517+
/// | MatchingOption* '-' MatchingOption*
518+
///
519+
mutating func lexMatchingOptionSequence(
520+
) throws -> AST.MatchingOptionSequence? {
521+
let ateCaret = recordLoc { $0.tryEat("^") }
522+
523+
// TODO: Warn on duplicate options, and options appearing in both adding
524+
// and removing lists?
525+
var adding: [AST.MatchingOption] = []
526+
while let opt = try lexMatchingOption() {
527+
adding.append(opt)
528+
}
529+
530+
// If the sequence began with a caret '^', options can only be added, so we're
531+
// done.
532+
if ateCaret.value {
533+
return .init(caretLoc: ateCaret.location, adding: adding, minusLoc: nil,
534+
removing: [])
535+
}
536+
537+
// Try to lex options to remove.
538+
let ateMinus = recordLoc { $0.tryEat("-") }
539+
if ateMinus.value {
540+
var removing: [AST.MatchingOption] = []
541+
while let opt = try lexMatchingOption() {
542+
// Text segment options can only be added, they cannot be removed
543+
// with (?-), they should instead be set to a different mode.
544+
if opt.isTextSegmentMode {
545+
throw ParseError.cannotRemoveTextSegmentOptions
546+
}
547+
removing.append(opt)
548+
}
549+
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
550+
removing: removing)
551+
}
552+
guard !adding.isEmpty else { return nil }
553+
return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: [])
554+
}
459555

460556
/// Try to consume the start of a group
461557
///
462558
/// GroupStart -> '(?' GroupKind | '('
463-
/// GroupKind -> Named | ':' | '|' | '>' | '=' | '!' | '<=' | '<!'
559+
/// GroupKind -> Named | ':' | '|' | '>' | '=' | '!' | '*' | '<=' | '<!'
560+
/// | '<*' | MatchingOptionSeq (':' | ')')
464561
/// Named -> '<' [^'>']+ '>' | 'P<' [^'>']+ '>'
465562
/// | '\'' [^'\'']+ '\''
466563
///
@@ -502,8 +599,25 @@ extension Source {
502599
return .namedCapture(name)
503600
}
504601

505-
throw ParseError.misc(
506-
"Unknown group kind '(?\(src.peek()!)'")
602+
// Matching option changing group (?iJmnsUxxxwDPSWy{..}-iJmnsUxxxwDPSW:).
603+
if let seq = try src.lexMatchingOptionSequence() {
604+
if src.tryEat(":") {
605+
return .changeMatchingOptions(seq, hasImplicitScope: false)
606+
}
607+
// If this isn't the start of an explicit group, we should have an
608+
// implicit group that covers the remaining elements of the current
609+
// group.
610+
// TODO: This implicit scoping behavior matches Oniguruma, but PCRE
611+
// also does it across alternations, which will require additional
612+
// handling.
613+
try src.expect(")")
614+
return .changeMatchingOptions(seq, hasImplicitScope: true)
615+
}
616+
617+
guard let next = src.peek() else {
618+
throw ParseError.expectedGroupSpecifier
619+
}
620+
throw ParseError.misc("Unknown group kind '(?\(next)'")
507621
}
508622

509623
// Explicitly spelled out PCRE2 syntax for some groups.

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,10 @@ extension Parser {
171171
if let kind = try source.lexGroupStart() {
172172
priorGroupCount += 1
173173
let child = try parse()
174-
try source.expect(")")
174+
// An implicit scoped group has already consumed its closing paren.
175+
if !kind.value.hasImplicitScope {
176+
try source.expect(")")
177+
}
175178
return .group(.init(kind, child, loc(_start)))
176179
}
177180
if let cccStart = try source.lexCustomCCStart() {

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,36 @@ public func scriptRun(_ child: AST) -> AST {
8989
public func atomicScriptRun(_ child: AST) -> AST {
9090
group(.atomicScriptRun, child)
9191
}
92+
func changeMatchingOptions(
93+
_ seq: AST.MatchingOptionSequence, hasImplicitScope: Bool, _ child: AST
94+
) -> AST {
95+
group(.changeMatchingOptions(seq, hasImplicitScope: hasImplicitScope), child)
96+
}
97+
98+
func matchingOptions(
99+
adding: [AST.MatchingOption.Kind] = [],
100+
removing: [AST.MatchingOption.Kind] = []
101+
) -> AST.MatchingOptionSequence {
102+
.init(caretLoc: nil, adding: adding.map { .init($0, location: .fake) },
103+
minusLoc: nil, removing: removing.map { .init($0, location: .fake)})
104+
}
105+
func matchingOptions(
106+
adding: AST.MatchingOption.Kind...,
107+
removing: AST.MatchingOption.Kind...
108+
) -> AST.MatchingOptionSequence {
109+
matchingOptions(adding: adding, removing: removing)
110+
}
111+
func unsetMatchingOptions(
112+
adding: [AST.MatchingOption.Kind]
113+
) -> AST.MatchingOptionSequence {
114+
.init(caretLoc: .fake, adding: adding.map { .init($0, location: .fake) },
115+
minusLoc: nil, removing: [])
116+
}
117+
func unsetMatchingOptions(
118+
adding: AST.MatchingOption.Kind...
119+
) -> AST.MatchingOptionSequence {
120+
unsetMatchingOptions(adding: adding)
121+
}
92122

93123
func quant(
94124
_ amount: AST.Quantification.Amount,

Tests/RegexTests/LexTests.swift

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ extension RegexTests {
9393
diagnoseUniScalarOverflow("{123456789}", base: "u")
9494
diagnoseUniScalarOverflow("{123456789}", base: "x")
9595

96+
// Text segment options
97+
diagnose("(?-y{g})", expecting: .cannotRemoveTextSegmentOptions) {
98+
_ = try $0.lexGroupStart()
99+
}
100+
diagnose("(?-y{w})", expecting: .cannotRemoveTextSegmentOptions) {
101+
_ = try $0.lexGroupStart()
102+
}
103+
96104
// Test expected group.
97105
diagnose(#"(*"#, expecting: .misc("Quantifier '*' must follow operand")) {
98106
_ = try $0.lexGroupStart()
@@ -106,6 +114,27 @@ extension RegexTests {
106114
diagnose(#"\p{a"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
107115
diagnose(#"\p{a="#, expecting: .expected("}")) { try $0.lexBasicAtom() }
108116
diagnose(#"(?#"#, expecting: .expected(")")) { _ = try $0.lexComment() }
117+
diagnose(#"(?x"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
118+
119+
diagnose(#"(?"#, expecting: .expectedGroupSpecifier) {
120+
_ = try $0.lexGroupStart()
121+
}
122+
123+
diagnose(#"(?^"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
124+
diagnose(#"(?^i"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
125+
diagnose(#"(?^-"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
126+
diagnose(#"(?^-)"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
127+
diagnose(#"(?^i-"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
128+
diagnose(#"(?^i-m)"#, expecting: .expected(")")) { _ = try $0.lexGroupStart() }
129+
130+
diagnose(#"(?y)"#, expecting: .expected("{")) { _ = try $0.lexGroupStart() }
131+
diagnose(#"(?y{)"#, expecting: .expected("g")) { _ = try $0.lexGroupStart() }
132+
diagnose(#"(?y{g)"#, expecting: .expected("}")) { _ = try $0.lexGroupStart() }
133+
diagnose(#"(?y{x})"#, expecting: .expected("g")) { _ = try $0.lexGroupStart() }
134+
135+
diagnose(#"(?k)"#, expecting: .misc("Unknown group kind '(?k'")) {
136+
_ = try $0.lexGroupStart()
137+
}
109138

110139
// TODO: want to dummy print out source ranges, etc, test that.
111140
}

0 commit comments

Comments
 (0)