Skip to content

Commit 4f14de3

Browse files
committed
Fix two crashers with quantification on transformed capture.
This patch fixes two issues:

- When an optional quantification matches 1 occurrence, the capture type constructed by the engine unexpectedly drops the `Optional` wrapper type. `Capture.value` is fixed by including the `Optional` wrapper type.
- When an optional or array quantification matches 0 occurrences, the capture type constructed by the engine unexpectedly uses `Substring` as the atom type instead of the transformed type from `CaptureTransform`. This is fixed by propagating capture transform result type information from the DSL API down to the `captureNil` and `captureArray` instructions in the bytecode. The engine will use the type information to construct the correct types for `nil` and `[]`.
1 parent 83c94bf commit 4f14de3

File tree

6 files changed

+135
-31
lines changed

6 files changed

+135
-31
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,13 @@ extension AST {
9494

9595
/// Whether this node has nested somewhere inside it a capture
9696
public var hasCapture: Bool {
97-
if case let .group(g) = self, g.kind.value.isCapturing {
97+
switch self {
98+
case .group(let g) where g.kind.value.isCapturing,
99+
.groupTransform(let g, _) where g.kind.value.isCapturing:
98100
return true
101+
default:
102+
break
99103
}
100-
101104
return self.children?.any(\.hasCapture) ?? false
102105
}
103106
}
@@ -207,14 +210,18 @@ extension AST {
207210

208211
// FIXME: Get this out of here
209212
public struct CaptureTransform: Equatable, Hashable, CustomStringConvertible {
213+
public let resultType: Any.Type
210214
public let closure: (Substring) -> Any
211215

212-
public init(_ closure: @escaping (Substring) -> Any) {
216+
public init(resultType: Any.Type, _ closure: @escaping (Substring) -> Any) {
217+
self.resultType = resultType
213218
self.closure = closure
214219
}
215220

216221
public func callAsFunction(_ input: Substring) -> Any {
217-
closure(input)
222+
let result = closure(input)
223+
assert(type(of: result) == resultType)
224+
return result
218225
}
219226

220227
public static func == (lhs: CaptureTransform, rhs: CaptureTransform) -> Bool {
@@ -229,7 +236,6 @@ public struct CaptureTransform: Equatable, Hashable, CustomStringConvertible {
229236
}
230237

231238
public var description: String {
232-
"<transform>"
239+
"<transform result_type=\(resultType)>"
233240
}
234241
}
235-

Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
// A tree representing the type of some captures.
1313
public enum CaptureStructure: Equatable {
14-
case atom(name: String? = nil)
14+
case atom(name: String? = nil, type: AnyType? = nil)
1515
indirect case array(CaptureStructure)
1616
indirect case optional(CaptureStructure)
1717
indirect case tuple([CaptureStructure])
@@ -47,6 +47,17 @@ extension AST {
4747
default:
4848
return innerCaptures
4949
}
50+
case .groupTransform(let group, let transform):
51+
let innerCaptures = group.child.captureStructure
52+
switch group.kind.value {
53+
case .capture:
54+
return .atom(type: AnyType(transform.resultType)) + innerCaptures
55+
case .namedCapture(let name):
56+
return .atom(name: name.value, type: AnyType(transform.resultType))
57+
+ innerCaptures
58+
default:
59+
return innerCaptures
60+
}
5061
case .conditional(let c):
5162
// A conditional's capture structure is effectively that of an alternation
5263
// between the true and false branches. However the condition may also
@@ -67,8 +78,6 @@ extension AST {
6778
quantification.amount.value == .zeroOrOne
6879
? CaptureStructure.optional
6980
: CaptureStructure.array)
70-
case .groupTransform:
71-
fatalError("Unreachable. Case will be removed later.")
7281
case .quote, .trivia, .atom, .customCharacterClass, .empty:
7382
return .empty
7483
}
@@ -135,8 +144,10 @@ extension CaptureStructure {
135144

136145
public func type(withAtomType atomType: Any.Type) -> Any.Type {
137146
switch self {
138-
case .atom:
147+
case .atom(_, type: nil):
139148
return atomType
149+
case .atom(_, type: let type?):
150+
return type.base
140151
case .array(let child):
141152
return TypeConstruction.arrayType(of: child.type(withAtomType: atomType))
142153
case .optional(let child):
@@ -213,16 +224,18 @@ extension CaptureStructure {
213224
func encode(_ node: CaptureStructure, isTopLevel: Bool = false) {
214225
switch node {
215226
// 〚`T` (atom)〛 ==> .atom
216-
case .atom(name: nil):
227+
case .atom(name: nil, type: nil):
217228
append(.atom)
218229
// 〚`name: T` (atom)〛 ==> .atom, `name`, '\0'
219-
case .atom(name: let name?):
230+
case .atom(name: let name?, type: nil):
220231
append(.namedAtom)
221232
let nameCString = name.utf8CString
222233
let nameSlot = UnsafeMutableRawBufferPointer(
223234
rebasing: buffer[offset ..< offset+nameCString.count])
224235
nameCString.withUnsafeBytes(nameSlot.copyMemory(from:))
225236
offset += nameCString.count
237+
case .atom(_, _?):
238+
fatalError("Cannot encode a capture structure with explicit types")
226239
// 〚`[T]`〛 ==> 〚`T`〛, .formArray
227240
case .array(let child):
228241
encode(child)

Sources/_MatchingEngine/Utility/Misc.swift

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,19 @@ extension BinaryInteger {
149149
}
150150
}
151151

152+
/// A wrapper of an existential metatype, equatable and hashable by reference.
153+
public struct AnyType: Equatable, Hashable {
154+
public var base: Any.Type
155+
156+
public init(_ type: Any.Type) {
157+
base = type
158+
}
159+
160+
public static func == (lhs: AnyType, rhs: AnyType) -> Bool {
161+
lhs.base == rhs.base
162+
}
163+
164+
public func hash(into hasher: inout Hasher) {
165+
hasher.combine(ObjectIdentifier(base))
166+
}
167+
}

Sources/_StringProcessing/Capture.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,10 @@ extension Capture {
5151
}
5252
return _openExistential(childType.base, do: helper)
5353
case .some(let subcapture):
54-
return subcapture.value
54+
func helper<T>(_ value: T) -> Any {
55+
Optional(value) as Any
56+
}
57+
return _openExistential(subcapture.value, do: helper)
5558
case .none(let childType):
5659
func helper<T>(_: T.Type) -> Any {
5760
nil as T? as Any

Sources/_StringProcessing/RegexDSL/DSL.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ public struct CapturingGroup<Match: MatchProtocol>: RegexProtocol {
179179
self.regex = .init(ast:
180180
.groupTransform(
181181
.init(.init(faking: .capture), component.regex.ast, .fake),
182-
transform: CaptureTransform {
182+
transform: CaptureTransform(resultType: NewCapture.self) {
183183
transform($0) as Any
184184
}))
185185
}

Tests/RegexTests/RegexDSLTests.swift

Lines changed: 83 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,55 @@ class RegexDSLTests: XCTestCase {
8080
== Tuple3("b", "cccc", ["d", "d", "d"]))
8181
}
8282

83+
func testQuantificationWithTransformedCapture() throws {
84+
// This test is to make sure transformed capture type information is
85+
// correctly propagated from the DSL into the bytecode and that the engine
86+
// is reconstructing the right types upon quantification (both empty and
87+
// non-empty).
88+
enum Word: Int32 {
89+
case apple
90+
case orange
91+
92+
init?(_ string: Substring) {
93+
switch string {
94+
case "apple": self = .apple
95+
case "orange": self = .orange
96+
default: return nil
97+
}
98+
}
99+
}
100+
let regex = Regex {
101+
"a".+
102+
OneOrMore(.whitespace)
103+
Optionally {
104+
OneOrMore(.digit).capture { Int($0)! }
105+
}
106+
Repeat {
107+
OneOrMore(.whitespace)
108+
OneOrMore(.word).capture { Word($0)! }
109+
}
110+
}
111+
// Assert the inferred capture type.
112+
let _: Tuple3<Substring, Int?, [Word]>.Type
113+
= type(of: regex).Match.self
114+
do {
115+
let input = "aaa 123 apple orange apple"
116+
let match = input.match(regex)?.match.tuple
117+
let (whole, number, words) = try XCTUnwrap(match)
118+
XCTAssertTrue(whole == input)
119+
XCTAssertEqual(number, 123)
120+
XCTAssertEqual(words, [.apple, .orange, .apple])
121+
}
122+
do {
123+
let input = "aaa "
124+
let match = input.match(regex)?.match.tuple
125+
let (whole, number, words) = try XCTUnwrap(match)
126+
XCTAssertTrue(whole == input)
127+
XCTAssertEqual(number, nil)
128+
XCTAssertTrue(words.isEmpty)
129+
}
130+
}
131+
83132
// Note: Types of nested captures should be flat, but are currently nested
84133
// due to the lack of variadic generics. Without it, we cannot effectively
85134
// express type constraints to concatenate splatted tuples.
@@ -174,39 +223,46 @@ class RegexDSLTests: XCTestCase {
174223
let line = """
175224
A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
176225
"""
177-
let regex = Regex {
178-
OneOrMore(CharacterClass.hexDigit).capture()
226+
227+
let regexWithCapture = Regex {
228+
OneOrMore(CharacterClass.hexDigit).capture(Unicode.Scalar.init(hex:))
179229
Optionally {
180230
".."
181-
OneOrMore(CharacterClass.hexDigit).capture()
231+
OneOrMore(CharacterClass.hexDigit).capture(Unicode.Scalar.init(hex:))
182232
}
183233
OneOrMore(CharacterClass.whitespace)
184234
";"
185235
OneOrMore(CharacterClass.whitespace)
186236
OneOrMore(CharacterClass.word).capture()
187237
Repeat(CharacterClass.any)
238+
} // Regex<(Substring, Unicode.Scalar?, Unicode.Scalar??, Substring)>
239+
do {
240+
// Assert the inferred capture type.
241+
typealias ExpectedMatch = Tuple4<
242+
Substring, Unicode.Scalar?, Unicode.Scalar??, Substring
243+
>
244+
let _: ExpectedMatch.Type = type(of: regexWithCapture).Match.self
245+
let maybeMatchResult = line.match(regexWithCapture)
246+
let matchResult = try XCTUnwrap(maybeMatchResult)
247+
let (wholeMatch, lower, upper, propertyString) = matchResult.match.tuple
248+
XCTAssertEqual(wholeMatch, Substring(line))
249+
XCTAssertEqual(lower, Unicode.Scalar(0xA6F0))
250+
XCTAssertEqual(upper, Unicode.Scalar(0xA6F1))
251+
XCTAssertEqual(propertyString, "Extend")
188252
}
189-
// Assert the inferred capture type.
190-
typealias ExpectedMatch = Tuple4<
191-
Substring, Substring, Substring?, Substring
192-
>
193-
let _: ExpectedMatch.Type = type(of: regex).Match.self
194-
func run<R: RegexProtocol>(
195-
_ regex: R
196-
) throws where R.Match == ExpectedMatch {
197-
let maybeMatchResult = line.match(regex)
253+
254+
do {
255+
let regexLiteral = try MockRegexLiteral(
256+
#"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#,
257+
matching: Tuple4<Substring, Substring, Substring?, Substring>.self)
258+
let maybeMatchResult = line.match(regexLiteral)
198259
let matchResult = try XCTUnwrap(maybeMatchResult)
199260
let (wholeMatch, lower, upper, propertyString) = matchResult.match.tuple
200261
XCTAssertEqual(wholeMatch, Substring(line))
201262
XCTAssertEqual(lower, "A6F0")
202263
XCTAssertEqual(upper, "A6F1")
203264
XCTAssertEqual(propertyString, "Extend")
204265
}
205-
let regexLiteral = try MockRegexLiteral(
206-
#"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#,
207-
matching: Tuple4<Substring, Substring, Substring?, Substring>.self)
208-
try run(regex)
209-
try run(regexLiteral)
210266
}
211267

212268
func testDynamicCaptures() throws {
@@ -233,3 +289,13 @@ class RegexDSLTests: XCTestCase {
233289
}
234290
}
235291
}
292+
293+
extension Unicode.Scalar {
294+
// Convert a hexadecimal string to a scalar
295+
public init?<S: StringProtocol>(hex: S) {
296+
guard let val = UInt32(hex, radix: 16), let scalar = Self(val) else {
297+
return nil
298+
}
299+
self = scalar
300+
}
301+
}

0 commit comments

Comments
 (0)