Skip to content

Commit c4ec0b0

Browse files
authored
MatchingEngine support for (flat) captures with full history (#105)
Capture and backreference support in backend and compiler
1 parent e46ee1e commit c4ec0b0

File tree

19 files changed

+1022
-516
lines changed

19 files changed

+1022
-516
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
extension Processor {

  // TODO: What all do we want to save? Configurable?
  // TODO: Do we need to save any registers?
  // TODO: Is this the right place to do function stack unwinding?

  /// A snapshot of processor state, built by `makeSavePoint(_:addressOnly:)`.
  struct SavePoint {
    /// The instruction address recorded with this save point.
    var pc: InstructionAddress

    /// The input position recorded with this save point, or `nil` when the
    /// save point was created with `addressOnly: true`.
    var pos: Position?

    // The end of the call stack, so we can slice it off
    // when failing inside a call.
    //
    // NOTE: Alternatively, also place return addresses on the
    // save point stack
    var stackEnd: CallStackAddress

    // FIXME: Save minimal info (e.g. stack position and
    // perhaps current start)
    var captureEnds: [_StoredCapture]

    /// All four stored fields as one labeled tuple, in declaration order.
    var destructure: (
      pc: InstructionAddress,
      pos: Position?,
      stackEnd: CallStackAddress,
      captureEnds: [_StoredCapture]
    ) {
      return (
        pc: pc,
        pos: pos,
        stackEnd: stackEnd,
        captureEnds: captureEnds
      )
    }
  }

  /// Builds a save point for the instruction address `pc`.
  ///
  /// The save point records the current call stack count and a copy of
  /// `storedCaptures`.
  ///
  /// - Parameters:
  ///   - pc: The instruction address to store in the save point.
  ///   - addressOnly: When `true`, no input position is stored (`pos` is
  ///     `nil`); otherwise `currentPosition` is stored. Defaults to `false`.
  /// - Returns: The newly created save point.
  func makeSavePoint(
    _ pc: InstructionAddress,
    addressOnly: Bool = false
  ) -> SavePoint {
    let savedPosition: Position?
    if addressOnly {
      savedPosition = nil
    } else {
      savedPosition = currentPosition
    }

    return SavePoint(
      pc: pc,
      pos: savedPosition,
      stackEnd: .init(callStack.count),
      captureEnds: storedCaptures
    )
  }
}
53+
54+

Sources/_MatchingEngine/Engine/Builder.swift

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ extension Program where Input.Element: Hashable {
2727
var nextBoolRegister = BoolRegister(0)
2828
var nextIntRegister = IntRegister(0)
2929
var nextPositionRegister = PositionRegister(0)
30+
var nextCaptureRegister = CaptureRegister(0)
3031

3132
// Special addresses or instructions
3233
var failAddressToken: AddressToken? = nil
@@ -81,8 +82,7 @@ extension Program.Builder {
8182
public mutating func buildMoveImmediate(
8283
_ value: Int, into: IntRegister
8384
) {
84-
let uint = UInt64(truncatingIfNeeded: value)
85-
assert(uint == value)
85+
let uint = UInt64(asserting: value)
8686
buildMoveImmediate(uint, into: into)
8787
}
8888

@@ -207,6 +207,27 @@ extension Program.Builder {
207207
instructions.append(.init(.print, .init(string: s)))
208208
}
209209

210+
/// Appends a `beginCapture` instruction whose payload is the capture
/// register `cap`.
///
/// - Parameter cap: The capture register the instruction refers to.
public mutating func buildBeginCapture(_ cap: CaptureRegister) {
  instructions.append(.init(
    .beginCapture,
    .init(capture: cap)
  ))
}
216+
217+
/// Appends an `endCapture` instruction whose payload is the capture
/// register `cap`.
///
/// - Parameter cap: The capture register the instruction refers to.
public mutating func buildEndCapture(_ cap: CaptureRegister) {
  instructions.append(.init(
    .endCapture,
    .init(capture: cap)
  ))
}
223+
224+
/// Appends a `backreference` instruction whose payload is the capture
/// register `cap`.
///
/// - Parameter cap: The capture register the instruction refers to.
public mutating func buildBackreference(_ cap: CaptureRegister) {
  instructions.append(.init(
    .backreference,
    .init(capture: cap)
  ))
}
230+
210231
// TODO: Mutating because of fail address fixup, drop when
211232
// that's removed
212233
public mutating func assemble() -> Program {
@@ -263,6 +284,7 @@ extension Program.Builder {
263284
regInfo.positions = nextPositionRegister.rawValue
264285
regInfo.consumeFunctions = consumeFunctions.count
265286
regInfo.assertionFunctions = assertionFunctions.count
287+
regInfo.captures = nextCaptureRegister.rawValue
266288

267289
return Program(
268290
instructions: InstructionList(instructions),
@@ -341,6 +363,11 @@ extension Program.Builder {
341363

342364
// Register helpers
343365
extension Program.Builder {
366+
/// Hands out the next unused capture register and advances the builder's
/// capture-register counter by one.
///
/// - Returns: The register value held by `nextCaptureRegister` before the
///   counter was advanced.
public mutating func makeCapture() -> CaptureRegister {
  let allocated = nextCaptureRegister
  nextCaptureRegister.rawValue += 1
  return allocated
}
370+
344371
public mutating func makeBoolRegister() -> BoolRegister {
345372
defer { nextBoolRegister.rawValue += 1 }
346373
return nextBoolRegister
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
/*
13+
14+
TODO: Specialized data structure for all captures:
15+
16+
- We want to be able to refer to COW prefixes for which
17+
simple appends do not invalidate
18+
- We want a compact save-point representation
19+
20+
TODO: Conjectures:
21+
22+
- We should be able to remove the entire capture history,
23+
lazily recomputing it on-request from the initial stored
24+
save point
25+
- We should be able to keep these flat and simple, lazily
26+
constructing structured types on-request
27+
28+
*/
29+
30+
31+
extension Processor {
  /// The recorded state of a single capture: every completed range so far,
  /// plus the start of a capture that has begun but not yet ended.
  struct _StoredCapture {
    // Set whenever we push the very first capture, allows us
    // to theoretically re-compute anything we want to later.
    fileprivate var startState: SavePoint? = nil

    // Save the entire history as we go, so that backtracking
    // can just lop-off aborted runs.
    //
    // Backtracking entries can specify a per-capture stack
    // index so that we can abort anything that came after.
    //
    // By remembering the entire history, we waste space, but
    // we get flexibility for now.
    //
    fileprivate var stack: [Range<Position>] = []

    // An in-progress capture start; `nil` while no capture is open.
    fileprivate var currentCaptureBegin: Position? = nil

    /// Debug-only consistency check.
    ///
    /// Without a `startState` there must be no history and no open capture;
    /// with a `startState` there must be at least one of the two.
    fileprivate func _invariantCheck() {
      if startState == nil {
        assert(stack.isEmpty)
        assert(currentCaptureBegin == nil)
      } else {
        assert(!stack.isEmpty || currentCaptureBegin != nil)
      }
    }

    // MARK: - IPI

    /// `true` when no completed capture range has been recorded.
    var isEmpty: Bool { stack.isEmpty }

    /// Every completed capture range, oldest first.
    var history: [Range<Position>] {
      stack
    }

    /// The most recently completed capture range, if any.
    var latest: Range<Position>? { stack.last }

    /// Start a new capture. If the previously started one was un-ended,
    /// will clear it and restart. If this is the first start, will save `initial`.
    ///
    /// - Parameters:
    ///   - idx: The position at which the capture begins.
    ///   - initial: The save point to remember if no start state is
    ///     currently stored.
    mutating func startCapture(
      _ idx: Position, initial: SavePoint
    ) {
      _invariantCheck()
      defer { _invariantCheck() }

      if startState == nil {
        startState = initial
      }
      currentCaptureBegin = idx
    }

    /// Ends the in-progress capture at `idx` and appends the resulting range
    /// to the history.
    ///
    /// Calling this without a matching `startCapture` is a programmer error
    /// and traps.
    ///
    /// - Parameter idx: The position at which the capture ends.
    mutating func endCapture(_ idx: Position) {
      _invariantCheck()
      defer { _invariantCheck() }

      guard let begin = currentCaptureBegin else {
        fatalError("Invariant violated: ending an unstarted capture")
      }

      stack.append(begin ..< idx)

      // The capture is complete. Clearing the begin marker prevents a later
      // unmatched `endCapture` from silently reusing a stale start, and keeps
      // the invariant intact if the history is subsequently truncated to
      // empty.
      currentCaptureBegin = nil
    }

    /// Discards every history entry at or after `stackIdx`.
    ///
    /// - Parameter stackIdx: The index of the first entry to drop; must not
    ///   exceed the current history length.
    mutating func fail(truncatingAt stackIdx: Int) {
      _invariantCheck()
      assert(stackIdx <= stack.endIndex)
      defer { _invariantCheck() }

      stack.removeSubrange(stackIdx...)

      // Forget the start state only once nothing depends on it: an open
      // capture still requires it for the invariant to hold.
      if stack.isEmpty && currentCaptureBegin == nil {
        startState = nil
      }
    }
  }
}
104+
105+
/// The capture ranges produced by a match, one inner array of ranges per
/// capture.
public struct CaptureList {
  /// For each capture, the ranges it recorded, in the order they are stored.
  var caps: [[Range<String.Index>]]

  /// Slices `s` with every stored range.
  ///
  /// - Parameter s: The string the stored ranges index into.
  /// - Returns: For each capture, the substrings of `s` for its ranges, in
  ///   the same order as `caps`.
  func extract(from s: String) -> [[Substring]] {
    var slicesPerCapture: [[Substring]] = []
    for ranges in caps {
      slicesPerCapture.append(ranges.map { range in s[range] })
    }
    return slicesPerCapture
  }

  /// The last substring of each capture.
  ///
  /// - Parameter s: The string the stored ranges index into.
  /// - Returns: One entry per capture: its final substring, or `nil` when
  ///   the capture has no ranges.
  func latest(from s: String) -> [Substring?] {
    // TODO: If empty, probably need empty range or something...
    let everything = extract(from: s)
    return everything.map { slices in slices.last }
  }
}

Sources/_MatchingEngine/Engine/Consume.swift

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,20 @@ extension Engine {
2222
matchMode: matchMode,
2323
isTracingEnabled: enableTracing)
2424
}
25+
}
2526

26-
public func consume(_ input: Input) -> Input.Index? {
27+
extension Engine where Input == String {
28+
/// Runs `consume(_:in:)` over the whole of `input`, from `startIndex` to
/// `endIndex`.
///
/// - Parameter input: The input to consume.
/// - Returns: Whatever the ranged overload returns for the full range: an
///   index paired with a `CaptureList`, or `nil`.
public func consume(_ input: Input) -> (Input.Index, CaptureList)? {
  let wholeInput = input.startIndex ..< input.endIndex
  return consume(input, in: wholeInput)
}
2933

3034
public func consume(
3135
_ input: Input,
3236
in range: Range<Input.Index>,
3337
matchMode: MatchMode = .prefix
34-
) -> Input.Index? {
38+
) -> (Input.Index, CaptureList)? {
3539
if enableTracing {
3640
print("Consume: \(input)")
3741
}
@@ -56,7 +60,10 @@ extension Engine {
5660
print("Result: nil")
5761
}
5862
}
59-
return result
63+
guard let result = result else { return nil }
64+
65+
let capList = cpu.storedCaptures.map { $0.history }
66+
return (result, CaptureList(caps: capList))
6067
}
6168
}
6269

Sources/_MatchingEngine/Engine/InstPayload.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ extension Instruction.Payload {
5252
case consumer(ConsumeFunctionRegister)
5353
case assertion(AssertionFunctionRegister)
5454
case addr(InstructionAddress)
55+
case capture(CaptureRegister)
5556

5657
case packedImmInt(Int, IntRegister)
5758
case packedAddrBool(InstructionAddress, BoolRegister)
@@ -121,6 +122,7 @@ extension Instruction.Payload {
121122
// TODO: We'd like to use shadow bits to assert on kind
122123
return TypedInt(rawValue)
123124
}
125+
124126
private func interpretPair<👻>(
125127
secondAs: TypedInt<👻>.Type = TypedInt<👻>.self
126128
) -> (UInt64, TypedInt<👻>) {
@@ -212,6 +214,13 @@ extension Instruction.Payload {
212214
interpret()
213215
}
214216

217+
/// Creates a payload that holds the capture register `capture`.
///
/// - Parameter capture: The register to store in the payload.
init(capture: CaptureRegister) {
  self.init(
    capture
  )
}
220+
/// The payload read back as a capture register, via `interpret()`.
var capture: CaptureRegister {
  let register: CaptureRegister = interpret()
  return register
}
223+
215224

216225
// MARK: Packed operand payloads
217226

Sources/_MatchingEngine/Engine/Instruction.swift

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,24 @@ extension Instruction {
228228
///
229229
case splitSaving
230230

231+
/// Begin the given capture
232+
///
233+
/// beginCapture(_:CapReg)
234+
///
235+
case beginCapture
236+
237+
/// End the given capture
238+
///
239+
/// endCapture(_:CapReg)
240+
///
241+
case endCapture
242+
243+
/// Match a previously captured value
244+
///
245+
/// backreference(_:CapReg)
246+
///
247+
case backreference
248+
231249
// MARK: Matching: State transitions
232250

233251
// TODO: State transitions need more work. We want

0 commit comments

Comments
 (0)