Skip to content

Commit 9a06a3e

Browse files
committed
Generalized bidirectional assertion support
Adds generalized assertion support as well as support for most built-in assertions and anchors.
1 parent 0e25188 commit 9a06a3e

File tree

12 files changed

+301
-6
lines changed

12 files changed

+301
-6
lines changed

Sources/_MatchingEngine/Engine/Builder.swift

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ extension Program where Input.Element: Hashable {
66
var sequences = TypedSetVector<[Input.Element], _SequenceRegister>()
77
var strings = TypedSetVector<String, _StringRegister>()
88
var consumeFunctions: [ConsumeFunction] = []
9+
var assertionFunctions: [AssertionFunction] = []
910

1011
// Map tokens to actual addresses
1112
var addressTokens: [InstructionAddress?] = []
@@ -173,6 +174,13 @@ extension Program.Builder {
173174
.consumeBy, .init(consumer: makeConsumeFunction(p))))
174175
}
175176

177+
public mutating func buildAssert(
178+
by p: @escaping Program.AssertionFunction
179+
) {
180+
instructions.append(.init(
181+
.assertBy, .init(assertion: makeAssertionFunction(p))))
182+
}
183+
176184
public mutating func buildAssert(
177185
_ e: Input.Element, into cond: BoolRegister
178186
) {
@@ -243,13 +251,15 @@ extension Program.Builder {
243251
regInfo.ints = nextIntRegister.rawValue
244252
regInfo.positions = nextPositionRegister.rawValue
245253
regInfo.consumeFunctions = consumeFunctions.count
254+
regInfo.assertionFunctions = assertionFunctions.count
246255

247256
return Program(
248257
instructions: InstructionList(instructions),
249258
staticElements: elements.stored,
250259
staticSequences: sequences.stored,
251260
staticStrings: strings.stored,
252261
staticConsumeFunctions: consumeFunctions,
262+
staticAssertionFunctions: assertionFunctions,
253263
registerInfo: regInfo)
254264
}
255265

@@ -377,5 +387,11 @@ extension Program.Builder {
377387
defer { consumeFunctions.append(f) }
378388
return ConsumeFunctionRegister(consumeFunctions.count)
379389
}
390+
public mutating func makeAssertionFunction(
391+
_ f: @escaping Program.AssertionFunction
392+
) -> AssertionFunctionRegister {
393+
defer { assertionFunctions.append(f) }
394+
return AssertionFunctionRegister(assertionFunctions.count)
395+
}
380396
}
381397

Sources/_MatchingEngine/Engine/Engine.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// Currently, engine binds the type and consume binds an instance.
22
// But, we can play around with this.
3-
public struct Engine<Input: Collection> where Input.Element: Hashable {
3+
public struct Engine<Input: BidirectionalCollection> where Input.Element: Hashable {
44

55
var program: Program<Input>
66

Sources/_MatchingEngine/Engine/InstPayload.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ extension Instruction.Payload {
3939
case bool(BoolRegister)
4040
case element(ElementRegister)
4141
case consumer(ConsumeFunctionRegister)
42+
case assertion(AssertionFunctionRegister)
4243
case addr(InstructionAddress)
4344

4445
case packedImmInt(Int, IntRegister)
@@ -186,6 +187,13 @@ extension Instruction.Payload {
186187
interpret()
187188
}
188189

190+
init(assertion: AssertionFunctionRegister) {
191+
self.init(assertion)
192+
}
193+
var assertion: AssertionFunctionRegister {
194+
interpret()
195+
}
196+
189197
init(addr: InstructionAddress) {
190198
self.init(addr)
191199
}

Sources/_MatchingEngine/Engine/Instruction.swift

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,18 @@ extension Instruction {
162162
/// Operand: Consume function register to call.
163163
case consumeBy
164164

165-
/// Custom assertion operation
165+
/// Custom lookaround assertion operation.
166+
/// Triggers a failure if customFunction returns false.
167+
///
168+
/// assert(
169+
/// _ customFunction: (
170+
/// input: Input,
171+
/// currentPos: Position,
172+
/// bounds: Range<Position>) -> Bool
173+
/// )
166174
///
167175
/// Operand: Assertion function register to call.
168-
static var assertHook: OpCode { fatalError() }
169-
176+
case assertBy
170177

171178
// MARK: Matching: Save points
172179

Sources/_MatchingEngine/Engine/Processor.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ struct Controller {
1414
}
1515

1616
struct Processor<
17-
Input: Collection
17+
Input: BidirectionalCollection
1818
> where Input.Element: Equatable { // maybe Hashable?
1919
typealias Element = Input.Element
2020

@@ -347,6 +347,15 @@ extension Processor {
347347
advance(to: nextIndex)
348348
controller.step()
349349

350+
case .assertBy:
351+
let reg = payload.assertion
352+
let assertion = registers[reg]
353+
guard assertion(input, currentPosition, bounds) else {
354+
signalFailure()
355+
return
356+
}
357+
controller.step()
358+
350359
case .print:
351360
// TODO: Debug stream
352361
doPrint(registers[payload.string])

Sources/_MatchingEngine/Engine/Program.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
public struct Program<Input: Collection> where Input.Element: Equatable {
22
public typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index?
3+
public typealias AssertionFunction =
4+
(Input, Input.Index, Range<Input.Index>) -> Bool
35
var instructions: InstructionList<Instruction>
46

57
var staticElements: [Input.Element]
68
var staticSequences: [[Input.Element]]
79
var staticStrings: [String]
810
var staticConsumeFunctions: [ConsumeFunction]
11+
var staticAssertionFunctions: [AssertionFunction]
912

1013
var registerInfo: RegisterInfo
1114

Sources/_MatchingEngine/Engine/Registers.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ extension Processor {
1515
// currently, these are static readonly
1616
var consumeFunctions: [Program<Input>.ConsumeFunction]
1717

18+
// currently, these are static readonly
19+
var assertionFunctions: [Program<Input>.AssertionFunction]
20+
1821
// currently, these are for comments and abort messages
1922
var strings: [String]
2023

@@ -63,6 +66,9 @@ extension Processor {
6366
subscript(_ i: ConsumeFunctionRegister) -> Program<Input>.ConsumeFunction {
6467
consumeFunctions[i.rawValue]
6568
}
69+
subscript(_ i: AssertionFunctionRegister) -> Program<Input>.AssertionFunction {
70+
assertionFunctions[i.rawValue]
71+
}
6672
}
6773
}
6874

@@ -82,6 +88,9 @@ extension Processor.Registers {
8288
self.consumeFunctions = program.staticConsumeFunctions
8389
assert(consumeFunctions.count == info.consumeFunctions)
8490

91+
self.assertionFunctions = program.staticAssertionFunctions
92+
assert(assertionFunctions.count == info.assertionFunctions)
93+
8594
self.strings = program.staticStrings
8695
assert(strings.count == info.strings)
8796

@@ -110,6 +119,7 @@ extension Program {
110119
var bools = 0
111120
var strings = 0
112121
var consumeFunctions = 0
122+
var assertionFunctions = 0
113123
var ints = 0
114124
var floats = 0
115125
var positions = 0

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,65 @@ extension AST.Atom {
441441
}
442442
}
443443
}
444+
445+
extension AST.Atom {
446+
/// Anchors and other built-in zero-width assertions
447+
public enum AssertionKind: String {
448+
/// \A
449+
case startOfSubject = #"\A"#
450+
451+
/// \Z
452+
case endOfSubjectBeforeNewline = #"\Z"#
453+
454+
/// \z
455+
case endOfSubject = #"\z"#
456+
457+
/// \K
458+
case resetStartOfMatch = #"\K"#
459+
460+
/// \G
461+
case firstMatchingPositionInSubject = #"\G"#
462+
463+
/// \y
464+
case textSegment = #"\y"#
465+
466+
/// \Y
467+
case notTextSegment = #"\Y"#
468+
469+
/// ^
470+
case startOfLine = #"^"#
471+
472+
/// $
473+
case endOfLine = #"$"#
474+
475+
/// \b (from outside a custom character class)
476+
case wordBoundary = #"\b"#
477+
478+
/// \B
479+
case notWordBoundary = #"\B"#
480+
481+
}
482+
483+
public var assertionKind: AssertionKind? {
484+
switch kind {
485+
case .startOfLine: return .startOfLine
486+
case .endOfLine: return .endOfLine
487+
488+
case .escaped(.wordBoundary): return .wordBoundary
489+
case .escaped(.notWordBoundary): return .notWordBoundary
490+
case .escaped(.startOfSubject): return .startOfSubject
491+
case .escaped(.endOfSubject): return .endOfSubject
492+
case .escaped(.textSegment): return .textSegment
493+
case .escaped(.notTextSegment): return .notTextSegment
494+
case .escaped(.endOfSubjectBeforeNewline):
495+
return .endOfSubjectBeforeNewline
496+
case .escaped(.firstMatchingPositionInSubject):
497+
return .firstMatchingPositionInSubject
498+
499+
case .escaped(.resetStartOfMatch): return .resetStartOfMatch
500+
501+
default: return nil
502+
}
503+
}
504+
}
505+

Sources/_MatchingEngine/Utility/TypedInt.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ public enum _StringRegister {}
144144
public typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister>
145145
public enum _ConsumeFunctionRegister {}
146146

147+
/// Used for assertion functions, e.g. anchors
148+
public typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister>
149+
public enum _AssertionFunctionRegister {}
150+
147151
/// UNIMPLEMENTED
148152
public typealias IntRegister = TypedInt<_IntRegister>
149153
public enum _IntRegister {}

Sources/_StringProcessing/CharacterClass.swift

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,3 +421,31 @@ extension AST.CustomCharacterClass {
421421
return self.isInverted ? cc.inverted : cc
422422
}
423423
}
424+
425+
extension CharacterClass {
426+
// FIXME: Calling on inverted sets won't be the same as the
427+
// inverse of a boundary if at the start or end of the
428+
// string. (Think through what we want: do it ourselves or
429+
// give the caller both options).
430+
func isBoundary(
431+
_ input: String,
432+
at pos: String.Index,
433+
bounds: Range<String.Index>
434+
) -> Bool {
435+
// FIXME: How should we handle bounds?
436+
// We probably need two concepts
437+
if input.isEmpty { return false }
438+
if pos == input.startIndex {
439+
return self.matches(in: input, at: pos) != nil
440+
}
441+
let priorIdx = input.index(before: pos)
442+
if pos == input.endIndex {
443+
return self.matches(in: input, at: priorIdx) != nil
444+
}
445+
446+
let prior = self.matches(in: input, at: priorIdx) != nil
447+
let current = self.matches(in: input, at: pos) != nil
448+
return prior != current
449+
}
450+
451+
}

Sources/_StringProcessing/Compiler.swift

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,86 @@ class Compiler {
108108
// We stick quoted content into read-only constant strings
109109
builder.buildMatchSequence(q.literal)
110110

111+
case .atom(let a) where a.assertionKind != nil:
112+
try emitAssertion(a.assertionKind!)
113+
111114
case .customCharacterClass, .atom:
112115
throw unsupported(node._dumpBase)
113116
}
114117
}
115118

119+
func emitAssertion(_ kind: AST.Atom.AssertionKind) throws {
120+
// FIXME: Depends on API model we have... We may want to
121+
// think through some of these with API interactions in mind
122+
//
123+
// This might break how we use `bounds` for both slicing
124+
// and things like `firstIndex`; that is, `firstIndex` may
125+
// need to supply both a slice bounds and a per-search bounds.
126+
switch kind {
127+
case .startOfSubject:
128+
builder.buildAssert { (input, pos, bounds) in
129+
pos == input.startIndex
130+
}
131+
132+
case .endOfSubjectBeforeNewline:
133+
builder.buildAssert { (input, pos, bounds) in
134+
if pos == input.endIndex { return true }
135+
return input.index(after: pos) == input.endIndex
136+
&& input[pos].isNewline
137+
}
138+
139+
case .endOfSubject:
140+
builder.buildAssert { (input, pos, bounds) in
141+
pos == input.endIndex
142+
}
143+
144+
case .resetStartOfMatch:
145+
// FIXME: Figure out how to communicate this out
146+
throw unsupported(#"\K (reset/keep assertion)"#)
147+
148+
case .firstMatchingPositionInSubject:
149+
// TODO: We can probably build a nice model with API here
150+
builder.buildAssert { (input, pos, bounds) in
151+
pos == bounds.lowerBound
152+
}
153+
154+
case .textSegment:
155+
// This we should be able to do!
156+
throw unsupported(#"\y (text segment)"#)
157+
158+
case .notTextSegment:
159+
// This we should be able to do!
160+
throw unsupported(#"\Y (not text segment)"#)
161+
162+
case .startOfLine:
163+
builder.buildAssert { (input, pos, bounds) in
164+
pos == input.startIndex ||
165+
input[input.index(before: pos)].isNewline
166+
}
167+
168+
case .endOfLine:
169+
builder.buildAssert { (input, pos, bounds) in
170+
pos == input.endIndex || input[pos].isNewline
171+
}
172+
173+
case .wordBoundary:
174+
// TODO: May want to consider Unicode level
175+
builder.buildAssert { (input, pos, bounds) in
176+
// TODO: How should we handle bounds?
177+
CharacterClass.word.isBoundary(
178+
input, at: pos, bounds: bounds)
179+
}
180+
181+
case .notWordBoundary:
182+
// TODO: May want to consider Unicode level
183+
builder.buildAssert { (input, pos, bounds) in
184+
// TODO: How should we handle bounds?
185+
!CharacterClass.word.isBoundary(
186+
input, at: pos, bounds: bounds)
187+
}
188+
}
189+
}
190+
116191
func emitLookaround(
117192
_ kind: (forwards: Bool, positive: Bool),
118193
_ child: AST

0 commit comments

Comments
 (0)