Skip to content

Commit a6132a5

Browse files
authored
Add options support to the compiler (#112)
`MatchingOptions` provides an interface for the compiler to manage group-scoped matching options, to apply matching option sequences from the AST, and to query the currently active options when building out matching behavior. Includes support and tests for the `s` and `u` option flags.
1 parent 074ee9f commit a6132a5

File tree

12 files changed

+455
-85
lines changed

12 files changed

+455
-85
lines changed

Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ extension AST {
3636
// be unset, only flipped between)
3737
case textSegmentGraphemeMode // y{g}
3838
case textSegmentWordMode // y{w}
39+
40+
// Swift semantic matching level
41+
case graphemeClusterSemantics // X
42+
case unicodeScalarSemantics // u
43+
case byteSemantics // b
3944
}
4045
public var kind: Kind
4146
public var location: SourceLocation
@@ -53,6 +58,15 @@ extension AST {
5358
return false
5459
}
5560
}
61+
62+
/// Whether this option selects a Swift semantic matching level, i.e. its
/// kind is `.graphemeClusterSemantics` (`X`), `.unicodeScalarSemantics`
/// (`u`), or `.byteSemantics` (`b`).
public var isSemanticMatchingLevel: Bool {
  // Pattern-match each semantic-level kind in turn; any other kind falls
  // through to `false`.
  if case .graphemeClusterSemantics = kind { return true }
  if case .unicodeScalarSemantics = kind { return true }
  if case .byteSemantics = kind { return true }
  return false
}
5670
}
5771

5872
/// A sequence of matching options written in source.

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ enum ParseError: Error, Hashable {
6565
case identifierCannotStartWithNumber(IdentifierKind)
6666

6767
case cannotRemoveTextSegmentOptions
68+
case cannotRemoveSemanticsOptions
6869
case expectedCalloutArgument
6970
}
7071

@@ -145,6 +146,8 @@ extension ParseError: CustomStringConvertible {
145146
return "\(i.diagDescription) must not start with number"
146147
case .cannotRemoveTextSegmentOptions:
147148
return "text segment mode cannot be unset, only changed"
149+
case .cannotRemoveSemanticsOptions:
150+
return "semantic level cannot be unset, only changed"
148151
case .expectedCalloutArgument:
149152
return "expected argument to callout"
150153
}

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,11 @@ extension Source {
574574
try src.expect("}")
575575
return opt
576576

577+
// Swift semantic level options
578+
case "X": return advanceAndReturn(.graphemeClusterSemantics)
579+
case "u": return advanceAndReturn(.unicodeScalarSemantics)
580+
case "b": return advanceAndReturn(.byteSemantics)
581+
577582
default:
578583
return nil
579584
}
@@ -618,6 +623,10 @@ extension Source {
618623
if opt.isTextSegmentMode {
619624
throw ParseError.cannotRemoveTextSegmentOptions
620625
}
626+
// Matching semantics options can only be added, not removed.
627+
if opt.isSemanticMatchingLevel {
628+
throw ParseError.cannotRemoveSemanticsOptions
629+
}
621630
removing.append(opt)
622631
}
623632
return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,

Sources/_StringProcessing/CharacterClass.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,12 +338,19 @@ extension AST.Atom {
338338
switch kind {
339339
case let .escaped(b): return b.characterClass
340340

341-
case .any: return .any
342-
343341
case .property:
344342
// TODO: Would our model type for character classes include
345343
// this? Or does grapheme-semantic mode complicate that?
346344
return nil
345+
346+
case .any:
347+
// `.any` is handled in the matching engine by Compiler.emitAny() and in
348+
// the legacy compiler by the `.any` instruction, which can provide lower
349+
// level instructions than the CharacterClass-generated consumer closure
350+
//
351+
// FIXME: We shouldn't be returning `nil` here; instead, the call site should
352+
// check for `.any` before trying to construct a character class.
353+
return nil
347354

348355
default: return nil
349356

Sources/_StringProcessing/Compiler.swift

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,13 @@ struct RegexProgram {
1818

1919
class Compiler {
2020
let ast: AST
21-
let matchLevel: CharacterClass.MatchLevel
22-
let options: REOptions
21+
private var options = MatchingOptions()
2322
private var builder = RegexProgram.Program.Builder()
2423

2524
init(
26-
ast: AST,
27-
matchLevel: CharacterClass.MatchLevel = .graphemeCluster,
28-
options: REOptions = []
25+
ast: AST
2926
) {
3027
self.ast = ast
31-
self.matchLevel = matchLevel
32-
self.options = options
3328
}
3429

3530
__consuming func emit() throws -> RegexProgram {
@@ -42,11 +37,9 @@ class Compiler {
4237
func emit(_ node: AST.Node) throws {
4338

4439
switch node {
45-
// Any: .
46-
// consume 1
47-
case .atom(let a) where a.kind == .any && matchLevel == .graphemeCluster:
48-
builder.buildAdvance(1)
49-
40+
case .atom(let a) where a.kind == .any:
41+
try emitAny()
42+
5043
// Single characters we just match
5144
case .atom(let a) where a.singleCharacter != nil :
5245
builder.buildMatch(a.singleCharacter!)
@@ -97,6 +90,9 @@ class Compiler {
9790
throw unsupported(node.renderAsCanonical())
9891

9992
case .group(let g):
93+
options.beginScope()
94+
defer { options.endScope() }
95+
10096
if let lookaround = g.lookaroundKind {
10197
try emitLookaround(lookaround, g.child)
10298
return
@@ -113,6 +109,10 @@ class Compiler {
113109
try emit(g.child)
114110
builder.buildEndCapture(cap)
115111

112+
case .changeMatchingOptions(let optionSequence, _):
113+
options.apply(optionSequence)
114+
try emit(g.child)
115+
116116
default:
117117
// FIXME: Other kinds...
118118
try emit(g.child)
@@ -124,8 +124,8 @@ class Compiler {
124124
// For now, we model sets and atoms as consumers.
125125
// This lets us rapidly expand support, and we can better
126126
// design the actual instruction set with real examples
127-
case _ where try node.generateConsumer(matchLevel) != nil:
128-
try builder.buildConsume(by: node.generateConsumer(matchLevel)!)
127+
case _ where try node.generateConsumer(options) != nil:
128+
try builder.buildConsume(by: node.generateConsumer(options)!)
129129

130130
case .quote(let q):
131131
// We stick quoted content into read-only constant strings
@@ -158,6 +158,31 @@ class Compiler {
158158
throw unsupported(node.renderAsCanonical())
159159
}
160160
}
161+
162+
/// Emits the program instruction for the "any" atom (`.`), choosing the
/// instruction from the currently active matching options.
///
/// The choice depends on two settings read from `options`:
/// - `semanticLevel`: whether the match advances by grapheme cluster or by
///   Unicode scalar.
/// - `dotMatchesNewline`: when `false`, the emitted consumer rejects
///   (returns `nil` for) a position whose element is a newline.
///
/// The function is declared `throws`, although nothing in this body throws.
func emitAny() throws {
  let level = options.semanticLevel
  let dotMatchesNewline = options.dotMatchesNewline

  switch level {
  case .graphemeCluster:
    guard !dotMatchesNewline else {
      // Any character is acceptable, so a plain one-step advance suffices.
      builder.buildAdvance(1)
      return
    }
    builder.buildConsume { input, bounds in
      let start = bounds.lowerBound
      if input[start].isNewline { return nil }
      return input.index(after: start)
    }

  case .unicodeScalar:
    guard !dotMatchesNewline else {
      // TODO: builder.buildAdvanceUnicodeScalar(1)
      builder.buildConsume { input, bounds in
        input.unicodeScalars.index(after: bounds.lowerBound)
      }
      return
    }
    builder.buildConsume { input, bounds in
      let start = bounds.lowerBound
      // NOTE(review): the newline test goes through `input[start]` rather
      // than the unicode-scalar view, even at the scalar level — confirm
      // this is intended.
      if input[start].isNewline { return nil }
      return input.unicodeScalars.index(after: start)
    }
  }
}
161186

162187
func emitAssertion(_ kind: AST.Atom.AssertionKind) throws {
163188
// FIXME: Depends on API model we have... We may want to
@@ -458,7 +483,18 @@ class Compiler {
458483

459484
func emitQuantification(_ quant: AST.Quantification) throws {
460485
let child = quant.child
461-
let kind = quant.kind.value
486+
487+
// If in reluctant-by-default mode, eager and reluctant need to be switched.
488+
let kind: AST.Quantification.Kind
489+
if options.isReluctantByDefault
490+
&& quant.kind.value != .possessive
491+
{
492+
kind = quant.kind.value == .eager
493+
? .reluctant
494+
: .eager
495+
} else {
496+
kind = quant.kind.value
497+
}
462498

463499
switch quant.amount.value.bounds {
464500
case (_, atMost: 0):

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,8 @@ func unsupported(
2828
file: StaticString = #file,
2929
line: UInt = #line
3030
) -> Unsupported {
31-
// TODO: how do we not have a public init for this?
32-
let fStr = file.withUTF8Buffer {
33-
String(decoding: $0, as: UTF8.self)
34-
}
3531
return Unsupported(
36-
message: s, file: fStr, line: Int(line))
32+
message: s, file: String(describing: file), line: Int(line))
3733
}
3834

3935
extension AST.Node {
@@ -42,8 +38,7 @@ extension AST.Node {
4238
/// A consumer is a Swift closure that matches against
4339
/// the front of an input range
4440
func generateConsumer(
45-
// TODO: Better option modeling
46-
_ opts: CharacterClass.MatchLevel
41+
_ opts: MatchingOptions
4742
) throws -> Program<String>.ConsumeFunction? {
4843
switch self {
4944
case .atom(let a):
@@ -77,10 +72,10 @@ extension AST.Atom {
7772
}
7873

7974
func generateConsumer(
80-
_ opts: CharacterClass.MatchLevel
75+
_ opts: MatchingOptions
8176
) throws -> Program<String>.ConsumeFunction? {
8277
// TODO: Wean ourselves off of this type...
83-
if let cc = self.characterClass?.withMatchLevel(opts) {
78+
if let cc = self.characterClass?.withMatchLevel(opts.matchLevel) {
8479
return { input, bounds in
8580
// FIXME: should we worry about out of bounds?
8681
cc.matches(in: input, at: bounds.lowerBound)
@@ -109,10 +104,16 @@ extension AST.Atom {
109104
// TODO: alias? casing?
110105
$0.name == name || $0.nameAlias == name
111106
}
107+
108+
case .any:
109+
fatalError(".atom(.any) is handled in emitAny")
112110

111+
case .startOfLine, .endOfLine:
112+
// handled in emitAssertion
113+
return nil
114+
113115
case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
114-
.any, .startOfLine, .endOfLine,
115-
.backreference, .subpattern, .callout, .backtrackingDirective:
116+
.backreference, .subpattern, .callout, .backtrackingDirective:
116117
// FIXME: implement
117118
return nil
118119
}
@@ -121,7 +122,7 @@ extension AST.Atom {
121122

122123
extension AST.CustomCharacterClass.Member {
123124
func generateConsumer(
124-
_ opts: CharacterClass.MatchLevel
125+
_ opts: MatchingOptions
125126
) throws -> Program<String>.ConsumeFunction {
126127
switch self {
127128
case .custom(let ccc):
@@ -212,7 +213,7 @@ extension AST.CustomCharacterClass.Member {
212213

213214
extension AST.CustomCharacterClass {
214215
func generateConsumer(
215-
_ opts: CharacterClass.MatchLevel
216+
_ opts: MatchingOptions
216217
) throws -> Program<String>.ConsumeFunction {
217218
// NOTE: Easy way to implement, obviously not performant
218219
let consumers = try members.map {
@@ -265,7 +266,7 @@ private func consumeScalar(
265266

266267
extension AST.Atom.CharacterProperty {
267268
func generateConsumer(
268-
_ opts: CharacterClass.MatchLevel
269+
_ opts: MatchingOptions
269270
) throws -> Program<String>.ConsumeFunction {
270271
// Handle inversion for us, albeit not efficiently
271272
func invert(
@@ -335,7 +336,7 @@ extension AST.Atom.CharacterProperty {
335336
extension Unicode.BinaryProperty {
336337
// FIXME: Semantic level, vet for precise defs
337338
func generateConsumer(
338-
_ opts: CharacterClass.MatchLevel
339+
_ opts: MatchingOptions
339340
) throws -> Program<String>.ConsumeFunction {
340341
switch self {
341342

@@ -499,7 +500,7 @@ extension Unicode.BinaryProperty {
499500
extension Unicode.POSIXProperty {
500501
// FIXME: Semantic level, vet for precise defs
501502
func generateConsumer(
502-
_ opts: CharacterClass.MatchLevel
503+
_ opts: MatchingOptions
503504
) -> Program<String>.ConsumeFunction {
504505
// FIXME: semantic levels, modes, etc
505506
switch self {
@@ -545,7 +546,7 @@ extension Unicode.POSIXProperty {
545546
extension Unicode.ExtendedGeneralCategory {
546547
// FIXME: Semantic level
547548
func generateConsumer(
548-
_ opts: CharacterClass.MatchLevel
549+
_ opts: MatchingOptions
549550
) throws -> Program<String>.ConsumeFunction {
550551
switch self {
551552
case .letter:

0 commit comments

Comments
 (0)