From 81d206aa613d24f834ae4e332578f9c4d6b915bc Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 21 Feb 2022 16:58:42 -0700 Subject: [PATCH 1/3] Simplify Capture and DynamicCaptures storage Change to an array of structs instead of recursive indirect enums. --- .../Regex/Parse/CaptureStructure.swift | 35 +- Sources/_StringProcessing/Capture.swift | 135 +++----- .../_StringProcessing/Engine/Registers.swift | 7 - .../Engine/StringProcessor.swift | 8 +- .../Engine/Structuralize.swift | 209 ++++-------- Sources/_StringProcessing/Executor.swift | 7 +- .../RegexDSL/DynamicCaptures.swift | 48 +-- .../_StringProcessing/RegexDSL/Match.swift | 12 +- Tests/RegexTests/CaptureTests.swift | 304 ++++++++++-------- Tests/RegexTests/ParseTests.swift | 6 +- Tests/RegexTests/RegexDSLTests.swift | 18 +- 11 files changed, 348 insertions(+), 441 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift b/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift index 6ea5b5960..3ec5860e6 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift @@ -12,7 +12,6 @@ // A tree representing the type of some captures. public enum CaptureStructure: Equatable { case atom(name: String? = nil, type: AnyType? 
= nil) - indirect case array(CaptureStructure) indirect case optional(CaptureStructure) indirect case tuple([CaptureStructure]) @@ -265,8 +264,6 @@ extension CaptureStructure { return atomType case .atom(_, type: let type?): return type.base - case .array(let child): - return TypeConstruction.arrayType(of: child.type(withAtomType: atomType)) case .optional(let child): return TypeConstruction.optionalType(of: child.type(withAtomType: atomType)) case .tuple(let children): @@ -281,6 +278,20 @@ extension CaptureStructure { public var type: Any.Type { type(withAtomType: DefaultAtomType.self) } + + public var atomType: AnyType { + switch self { + case .atom(_, type: nil): + return .init(Substring.self) + case .atom(_, type: let type?): + return type + case .optional(let child): + return child.atomType + case .tuple: + fatalError("Recursive nesting has no single atom type") + } + + } } // MARK: - Serialization @@ -291,7 +302,7 @@ extension CaptureStructure { case end = 0 case atom = 1 case namedAtom = 2 - case formArray = 3 +// case formArray = 3 case formOptional = 4 case beginTuple = 5 case endTuple = 6 @@ -314,7 +325,6 @@ extension CaptureStructure { /// encode(〚`T`〛) ==> , 〚`T`〛, .end /// 〚`T` (atom)〛 ==> .atom /// 〚`name: T` (atom)〛 ==> .atom, `name`, '\0' - /// 〚`[T]`〛 ==> 〚`T`〛, .formArray /// 〚`T?`〛 ==> 〚`T`〛, .formOptional /// 〚`(T0, T1, ...)` (top level)〛 ==> 〚`T0`〛, 〚`T1`〛, ... /// 〚`(T0, T1, ...)`〛 ==> .beginTuple, 〚`T0`〛, 〚`T1`〛, ..., .endTuple @@ -334,7 +344,8 @@ extension CaptureStructure { var offset = MemoryLayout.stride /// Appends a code to the buffer, advancing the offset to the next position. func append(_ code: Code) { - buffer.storeBytes(of: code, toByteOffset: offset, as: Code.self) + buffer.storeBytes( + of: code.rawValue, toByteOffset: offset, as: UInt8.self) offset += MemoryLayout.stride } /// Recursively encode the node to the buffer. 
@@ -353,10 +364,6 @@ extension CaptureStructure { offset += nameCString.count case .atom(_, _?): fatalError("Cannot encode a capture structure with explicit types") - // 〚`[T]`〛 ==> 〚`T`〛, .formArray - case .array(let child): - encode(child) - append(.formArray) // 〚`T?`〛 ==> 〚`T`〛, .formOptional case .optional(let child): encode(child) @@ -419,9 +426,6 @@ extension CaptureStructure { let name = String(cString: stringAddress) offset += name.utf8CString.count currentScope.append(.atom(name: name)) - case .formArray: - let lastIndex = currentScope.endIndex - 1 - currentScope[lastIndex] = .array(currentScope[lastIndex]) case .formOptional: let lastIndex = currentScope.endIndex - 1 currentScope[lastIndex] = .optional(currentScope[lastIndex]) @@ -454,11 +458,6 @@ extension CaptureStructure: CustomStringConvertible { : String(describing: type) printer.print("Atom(\(name): \(type))") - case let .array(c): - printer.printBlock("Array") { printer in - c._print(&printer) - } - case let .optional(c): printer.printBlock("Optional") { printer in c._print(&printer) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index 3b2f20ac2..c9b75cdfd 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -11,112 +11,57 @@ import _MatchingEngine -// TODO: what here should be in the compile-time module? +/// A structured capture +struct StructuredCapture { + /// The `.optional` height of the result + var numOptionals = 0 -enum Capture { - case atom(Any) - indirect case tuple([Capture]) - indirect case some(Capture) - case none(childType: AnyType) - indirect case array([Capture], childType: AnyType) -} - -extension Capture { - static func none(childType: Any.Type) -> Capture { - .none(childType: AnyType(childType)) - } + var storedCapture: StoredCapture? 
- static func array(_ children: [Capture], childType: Any.Type) -> Capture { - .array(children, childType: AnyType(childType)) + var numSomes: Int { + storedCapture == nil ? numOptionals - 1 : numOptionals } } -extension Capture { - static func tupleOrAtom(_ elements: [Capture]) -> Self { - elements.count == 1 ? elements[0] : .tuple(elements) - } +/// A storage form for a successful capture +struct StoredCapture { + // TODO: drop optional when engine tracks all ranges + var range: Range? - static var void: Capture { - .tuple([]) - } + // If strongly typed, value is set + var value: Any? = nil +} - var value: Any { - switch self { - case .atom(let atom): - return atom - case .tuple(let elements): - return TypeConstruction.tuple( - of: elements.map(\.value)) - case .array(let elements, let childType): - func helper(_: T.Type) -> Any { - elements.map { $0.value as! T } - } - return _openExistential(childType.base, do: helper) - case .some(let subcapture): - func helper(_ value: T) -> Any { - Optional(value) as Any - } - return _openExistential(subcapture.value, do: helper) - case .none(let childType): - func helper(_: T.Type) -> Any { - nil as T? as Any - } - return _openExistential(childType.base, do: helper) +extension StructuredCapture { + func extractExistentialMatchComponent( + from input: Substring + ) -> Any { + var underlying: Any + if let cap = self.storedCapture { + underlying = cap.value ?? input[cap.range!] + } else { + // Ok since we Any-box every step up the ladder + underlying = Optional(nil) as Any } - } - - private func prepending(_ newElement: Any) -> Self { - switch self { - case .atom, .some, .none, .array: - return .tuple([.atom(newElement), self]) - case .tuple(let elements): - return .tuple([.atom(newElement)] + elements) + for _ in 0.. 
Any { - prepending(wholeMatch).value + return underlying } } -extension Capture: CustomStringConvertible { - public var description: String { - var printer = PrettyPrinter() - _print(&printer) - return printer.finish() - } - - private func _print(_ printer: inout PrettyPrinter) { - switch self { - case let .atom(n): - printer.print("Atom(\(n))") - case let .tuple(ns): - if ns.isEmpty { - printer.print("Tuple()") - return - } - - printer.printBlock("Tuple") { printer in - for n in ns { - n._print(&printer) - } - } - - case let .some(n): - printer.printBlock("Some") { printer in - n._print(&printer) - } - - case let .none(childType): - printer.print("None(\(childType))") - - case let .array(ns, childType): - printer.printBlock("Array(\(childType))") { printer in - for n in ns { - n._print(&printer) - } - } - - } +extension Sequence where Element == StructuredCapture { + // FIXME: This is a stop gap where we still slice the input + // and traffic through existentials + func extractExistentialMatch( + from input: Substring + ) -> Any { + var caps = Array() + caps.append(input) + caps.append(contentsOf: self.map { + $0.extractExistentialMatchComponent(from: input) + }) + return TypeConstruction.tuple(of: caps) } } + diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index ca3e07bfe..2cc3cddf5 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -88,13 +88,6 @@ extension Processor { subscript(_ i: ValueRegister) -> Any { get { values[i.rawValue] } set { - print(""" - values: \(values) - i: \(i) - newValue: \(newValue) - """) - print(values) - print(i) values[i.rawValue] = newValue } } diff --git a/Sources/_StringProcessing/Engine/StringProcessor.swift b/Sources/_StringProcessing/Engine/StringProcessor.swift index 042c81180..14fe02681 100644 --- a/Sources/_StringProcessing/Engine/StringProcessor.swift +++ 
b/Sources/_StringProcessing/Engine/StringProcessor.swift @@ -14,16 +14,18 @@ typealias Program = MEProgram public struct MatchResult { public var range: Range - var captures: Capture + var captures: [StructuredCapture] var destructure: ( - matched: Range, captures: Capture + matched: Range, + captures: [StructuredCapture] ) { (range, captures) } init( - _ matched: Range, _ captures: Capture + _ matched: Range, + _ captures: [StructuredCapture] ) { self.range = matched self.captures = captures diff --git a/Sources/_StringProcessing/Engine/Structuralize.swift b/Sources/_StringProcessing/Engine/Structuralize.swift index e02a9003d..7410f69fb 100644 --- a/Sources/_StringProcessing/Engine/Structuralize.swift +++ b/Sources/_StringProcessing/Engine/Structuralize.swift @@ -1,173 +1,78 @@ import _MatchingEngine -private enum StructureKind { - case optional - case history - case latest -} -extension StructureKind: CustomStringConvertible { - var description: String { +extension CaptureStructure { + var numOptionals: Int { switch self { - case .optional: return "optional" - case .history: return "history" - case .latest: return "latest" - } - } -} - -// TODO: How stateful is this, really? -// TODO: Should we build up a result more mutably? -private struct Fabricator { - var list: CaptureList - let input: String - - var curIdx = 0 - - // TODO: We may just need to know whether we're - // history mapping or not... - var structStack: Array = [] - - mutating func next( - ) throws -> Processor._StoredCapture { - guard curIdx < list.caps.endIndex else { - // TODO: Is `throws` a bit much here? 
- // Maybe just precondition or hard trap - throw Unreachable("Capture count mismatch") - } - defer { list.caps.formIndex(after: &curIdx) } - return list.caps[curIdx] - } - - var currentIsEmpty: Bool { - guard curIdx < list.caps.endIndex else { - fatalError("Capture count mismatch") + case .atom: return 0 + case .optional(let o): + return 1 + o.numOptionals + case .tuple: + // FIXME: Separate CaptureStructure and a component + fatalError("Recursive nesting") + @unknown default: + fatalError("Unknown default") } - - return list.caps[curIdx].isEmpty } - mutating func formValue( - _ t: AnyType - ) throws -> Capture { - let cap = try next() - - switch structStack.last { - case nil, .latest: - guard let v = cap.latestValue else { - // TODO: Should we actually be tracking whether there - // were any optionals along the way, or just the latest - // kind? - throw Unreachable("No actual capture recorded") - } - guard type(of: v) == t.base else { - throw Unreachable("Type mismatch") - } - return .atom(v) - - case .history: - let hist = try cap.valueHistory.map { v -> Capture in - guard type(of: v) == t.base else { - throw Unreachable("Type mismatch") + // FIXME: Do it all in one pass, no need for all these + // intermediary arrays + func structuralize( + _ list: CaptureList, + _ input: String + ) throws -> [StructuredCapture] { + + func mapCap( + _ cap: CaptureStructure, + _ storedCap: Processor._StoredCapture + ) -> StructuredCapture { + // TODO: CaptureList perhaps should store a + // metatype or relevant info... + let numOptionals = cap.numOptionals + + if cap.atomType.base == Substring.self { + // FIXME: What if a typed capture is Substring? 
+ assert(!storedCap.hasValues) + + if let r = storedCap.latest { + return StructuredCapture( + numOptionals: numOptionals, + storedCapture: StoredCapture(range: r)) } - return .atom(v) - } - return .array(hist, childType: t) - case .optional: - // FIXME: We actually need to know if there's any array - // above us to know whether to propagate/map-over history - // at every step. - - if cap.valueHistory.isEmpty { - return .none(childType: t) - } - guard let v = cap.latestValue else { - // TODO: Should we actually be tracking whether there - // were any optionals along the way, or just the latest - // kind? - throw Unreachable("No actual capture recorded") - } - guard type(of: v) == t.base else { - throw Unreachable("Type mismatch") + return StructuredCapture( + numOptionals: numOptionals, + storedCapture: nil) } - return .some(.atom(v)) - } - } - - mutating func formSlice( - ) throws -> Capture { - let cap = try next() - switch structStack.last { - case nil, .latest: - guard let r = cap.latest else { - // TODO: Should we actually be tracking whether there - // were any optionals along the way, or just the latest - // kind? 
- throw Unreachable("No actual capture recorded") + guard (storedCap.isEmpty || storedCap.hasValues) else { + print(storedCap) + fatalError() } - return .atom(input[r]) + // TODO: assert types are the same, under all the + // optionals - case .history: - let hist = cap.history.map { r -> Capture in - return .atom(input[r]) + if let v = storedCap.latestValue { + return StructuredCapture( + numOptionals: numOptionals, + storedCapture: StoredCapture(range: storedCap.latest, value: v)) } - return .array(hist, childType: Substring.self) - - case .optional: - guard let r = cap.history.last else { - return .none(childType: Substring.self) - } - return .some(.atom(input[r])) + return StructuredCapture( + numOptionals: numOptionals, + storedCapture: nil) } - } -} - -extension CaptureStructure { - func structuralize( - _ list: CaptureList, - _ input: String - ) throws -> Capture { - var fab = Fabricator(list: list, input: input) - return try _structuralize(&fab) - } - private func _structuralize( - _ fab: inout Fabricator - ) throws -> Capture { switch self { - case let .atom(name, type): - // TODO: names - guard name == nil else { - throw Unsupported("names...") + case let .tuple(caps): + assert(list.caps.count == caps.count) + var result = Array() + for (cap, storedCap) in zip(caps, list.caps) { + result.append(mapCap(cap, storedCap)) } + return result - if let t = type { - return try fab.formValue(t) - } - return try fab.formSlice() - - case let .array(a): - fab.structStack.append(.history) - defer { fab.structStack.removeLast() } - return try a._structuralize(&fab) - - case let .optional(o): - // NOTE: This has the effect of flattening nested - // optionals. Not sure what we actually want here. - // - // Also, this will not add optional to nested types, - // again not sure what we want... 
- fab.structStack.append(.optional) - defer { fab.structStack.removeLast() } - return try o._structuralize(&fab) - - case let .tuple(t): - let members = try t.map { try $0._structuralize(&fab) } - return .tuple(members) - - @unknown default: - throw Unreachable("Version mismatch with parser") + default: + assert(list.caps.count == 1) + return [mapCap(self, list.caps.first!)] } } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 2a3ca5e29..9bff0fa78 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -31,8 +31,11 @@ public struct Executor { } let capStruct = engine.program.captureStructure do { - let caps = try capStruct.structuralize(capList, input) - return MatchResult(range.lowerBound.. + diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift index f77d9c3b4..5a8ce37ed 100644 --- a/Sources/_StringProcessing/RegexDSL/Match.swift +++ b/Sources/_StringProcessing/RegexDSL/Match.swift @@ -50,12 +50,16 @@ extension RegexProtocol { } let convertedMatch: Match if Match.self == (Substring, DynamicCaptures).self { - convertedMatch = (input[range], DynamicCaptures(captures)) as! Match - } else if Match.self == Substring.self { + let dynCaps = captures.map { + DynamicCapture($0, in: input) + } + convertedMatch = (input[range], dynCaps) as! Match + } else + if Match.self == Substring.self { convertedMatch = input[range] as! Match } else { - let typeErasedMatch = captures.matchValue( - withWholeMatch: input[range] + let typeErasedMatch = captures.extractExistentialMatch( + from: input[range] ) convertedMatch = typeErasedMatch as! 
Match } diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 46fbe38da..88cbe59cf 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -3,41 +3,93 @@ import XCTest @testable import _StringProcessing import _MatchingEngine -extension Capture: ExpressibleByStringLiteral { - fileprivate init(_ s: String) { - self = .atom(s[...]) +extension StructuredCapture { + func formatStringCapture(input: String) -> String { + var res = String(repeating: "some(", count: numSomes) + if let r = self.storedCapture?.range { + res += input[r] + } else { + res += "none" + } + res += String(repeating: ")", count: numSomes) + return res } - public init(stringLiteral: String) { - self.init(stringLiteral) +} + +extension Sequence where Element == StructuredCapture { + func formatStringCaptures(input: String) -> String { + var res = "[" + res += self.map { + $0.formatStringCapture(input: input) + }.joined(separator: ", ") + res += "]" + return res } } -// TODO: Move `flatCaptureTest`s over here too... +struct StringCapture { + var contents: String? + var numOptionals: Int + + var numSomes: Int { + contents == nil ? numOptionals - 1 : numOptionals + } + + static var none: Self { + self.init(contents: nil, numOptionals: 1) + } + static func some(_ s: Self) -> Self { + self.init( + contents: s.contents, numOptionals: s.numOptionals+1) + } +} -private func isEqual(_ lhs: Capture, _ rhs: Capture) -> Bool { - switch (lhs, rhs) { - case let (.atom(a), .atom(b)): - // FIXME: Needed because "a" != "a"[...] 
existentially - let lhsStr = String(describing: a) - let rhsStr = String(describing: b) - - // :-( - return lhsStr == rhsStr - - case let (.tuple(a), .tuple(b)): - return zip(a, b).map(isEqual).all({$0}) - case let (.some(a), .some(b)): - return isEqual(a, b) - case let (.none(a), .none(b)): - return a == b - case let (.array(a, tA), .array(b, tB)): - let contentsCompare = zip(a, b).map(isEqual).all({$0}) - return contentsCompare && tA == tB - - default: return false +extension StringCapture: ExpressibleByStringLiteral { + init(stringLiteral: String) { + self.contents = stringLiteral + self.numOptionals = 0 + } +} + +extension StringCapture: CustomStringConvertible { + var description: String { + var res = String(repeating: "some(", count: numSomes) + if let s = self.contents { + res += s + } else { + res += "none" + } + res += String(repeating: ")", count: numSomes) + return res + } +} + +extension StringCapture { + func isEqual( + to structCap: StructuredCapture, + in input: String + ) -> Bool { + guard numOptionals == structCap.numOptionals else { + return false + } + guard let r = structCap.storedCapture?.range else { + return contents == nil + } + guard let s = contents else { + return false + } + return input[r] == s } } +// NOTE: These tests are not tests of type-construction logic +// (e.g. making sure we actually have the right number of +// Optional wrappers), because we test equivalence a little +// before that step. + + +// TODO: Move `flatCaptureTest`s over here too... + func compile(_ ast: AST) -> Executor { let tree = ast.dslTree let prog = try! 
Compiler(tree: tree).emit() @@ -48,7 +100,7 @@ func compile(_ ast: AST) -> Executor { func captureTest( _ regex: String, _ expected: CaptureStructure, - _ tests: (input: String, output: Capture)..., + _ tests: (input: String, output: [StringCapture])..., skipEngine: Bool = false, file: StaticString = #file, line: UInt = #line @@ -94,22 +146,30 @@ func captureTest( input: input, in: inputRange, mode: .wholeString )! - let cap = try! capStructure.structuralize(capFlat, input) + let caps = try! capStructure.structuralize( + capFlat, input) + + guard caps.count == output.count else { + XCTFail(""" + Mismatch capture count: + Expected: + \(output) + Seen: + \(caps.formatStringCaptures(input: input)) + """) + continue + } - guard isEqual(cap, output) else { + guard output.elementsEqual(caps, by: { + $0.isEqual(to: $1, in: input) + }) else { XCTFail(""" - regex: \(regex), input: "\(input)" - Structure: - \(capStructure) - Capture list: - \(capFlat.latestUntyped(from: input)) - Expected: - \(output) - Actual: - \(cap) - """, - file: file, - line: line) + Mismatch capture count: + Expected: + \(output) + Seen: + \(caps.formatStringCaptures(input: input)) + """) continue } } @@ -118,89 +178,64 @@ func captureTest( extension RegexTests { func testLiteralStructuredCaptures() throws { - func some(_ c: Capture) -> Capture { - .some(c) - } - - func array(_ cs: Capture...) -> Capture { - .array(cs, childType: Substring.self) - } - func someArray(_ cs: Capture...) -> Capture { - .some(.array(cs, childType: Substring.self)) - } - - func tuple(_ ss: Capture...) 
-> Capture { - .tuple(ss) - } - - var none: Capture { - .none(childType: Substring.self) - } - var noArray: Capture { - .none(childType: [Substring].self) - } - var noOpt: Capture { - .none(childType: Substring?.self) - } - captureTest( "abc", .empty, - ("abc", .void)) + ("abc", [])) captureTest( "a(b)c", .atom(), - ("abc", "b")) + ("abc", ["b"])) captureTest( "a(b*)c", .atom(), - ("abc", "b"), - ("ac", ""), - ("abbc", "bb")) + ("abc", ["b"]), + ("ac", [""]), + ("abbc", ["bb"])) captureTest( "a(b)*c", .optional(.atom()), - ("abc", some("b")), - ("ac", none), - ("abbc", some("b"))) + ("abc", [.some("b")]), + ("ac", [.none]), + ("abbc", [.some("b")])) captureTest( "a(b)+c", .atom(), - ("abc", "b"), - ("abbc", "b")) + ("abc", ["b"]), + ("abbc", ["b"])) captureTest( "a(b)?c", .optional(.atom()), - ("ac", none), - ("abc", some("b"))) + ("ac", [.none]), + ("abc", [.some("b")])) captureTest( "(a)(b)(c)", .tuple([.atom(),.atom(),.atom()]), - ("abc", tuple("a", "b", "c"))) + ("abc", ["a", "b", "c"])) captureTest( "a|(b)", .optional(.atom()), - ("a", none), - ("b", some("b"))) + ("a", [.none]), + ("b", [.some("b")])) captureTest( "(a)|(b)", .tuple(.optional(.atom()), .optional(.atom())), - ("a", tuple(some("a"), none)), - ("b", tuple(none, some("b")))) + ("a", [.some("a"), .none]), + ("b", [.none, .some("b")])) captureTest( "((a)|(b))", .tuple(.atom(), .optional(.atom()), .optional(.atom())), - ("a", tuple("a", some("a"), none)), - ("b", tuple("b", none, some("b")))) + ("a", ["a", .some("a"), .none]), + ("b", ["b", .none, .some("b")])) captureTest( "((a)|(b))?", @@ -208,8 +243,8 @@ extension RegexTests { .optional(.atom()), .optional(.optional(.atom())), .optional(.optional(.atom()))), - ("a", tuple(some("a"), some("a"), none)), - ("b", tuple(some("b"), none, some("b")))) + ("a", [.some("a"), .some(.some("a")), .some(.none)]), + ("b", [.some("b"), .some(.none), .some(.some("b"))])) captureTest( "((a)|(b))*", @@ -217,7 +252,7 @@ extension RegexTests { .optional(.atom()), 
.optional(.optional(.atom())), .optional(.optional(.atom()))), - ("a", tuple(some("a"), some(some("a")), none)), + ("a", [.some("a"), .some(.some("a")), .some(.none)]), skipEngine: true) captureTest( @@ -253,17 +288,17 @@ extension RegexTests { captureTest( "(a)", .atom(), - ("a", "a")) + ("a", ["a"])) captureTest( "((a))", .tuple([.atom(), .atom()]), - ("a", tuple("a", "a"))) + ("a", ["a", "a"])) captureTest( "(((a)))", .tuple([.atom(), .atom(), .atom()]), - ("a", tuple("a", "a", "a"))) + ("a", ["a", "a", "a"])) // broke @@ -282,110 +317,111 @@ extension RegexTests { captureTest( "a|(b*)", .optional(.atom()), - ("a", none), - ("", some("")), - ("b", some("b")), - ("bbb", some("bbb"))) + ("a", [.none]), + ("", [.some("")]), + ("b", [.some("b")]), + ("bbb", [.some("bbb")])) captureTest( "a|(b)*", .optional(.optional(.atom())), - ("a", none), - ("", someArray()), - ("b", someArray("b")), - ("bbb", someArray("b", "b", "b")), + ("a", [.none]), + ("", [.some("")]), + ("b", [.some("b")]), + ("bbb", [.some("b")]), skipEngine: true) captureTest( "a|(b)+", .optional(.atom()), - ("a", noArray), - ("b", someArray("b")), - ("bbb", someArray("b", "b", "b")), + ("a", [.none]), + ("b", [.some("b")]), + ("bbb", [.some("b")]), skipEngine: true) captureTest( "a|(b)?", .optional(.optional(.atom())), - ("a", noOpt), - ("", noOpt), - ("b", .some(some("b"))), + ("a", [.none]), + ("", [.none]), + ("b", [.some(.some("b"))]), skipEngine: true) captureTest( "a|(b|c)", .optional(.atom()), - ("a", none), - ("b", some("b")), - ("c", some("c"))) + ("a", [.none]), + ("b", [.some("b")]), + ("c", [.some("c")])) captureTest( "a|(b*|c)", .optional(.atom()), - ("a", none), - ("b", some("b")), - ("c", some("c"))) + ("a", [.none]), + ("b", [.some("b")]), + ("c", [.some("c")])) captureTest( "a|(b|c)*", .optional(.optional(.atom())), - ("a", noArray), - ("", noArray), - ("b", someArray("b")), - ("bbb", someArray("b", "b", "b")), + ("a", [.none]), + ("", [.some("")]), + ("b", [.some("b")]), + ("bbb", 
[.some("b")]), skipEngine: true) captureTest( "a|(b|c)?", .optional(.optional(.atom())), - ("a", noOpt), - ("", noOpt), - ("b", .some(some("b"))), - ("c", .some(some("c"))), + ("a", [.none]), + ("", [.none]), + ("b", [.some(.some("b"))]), + ("c", [.some(.some("c"))]), skipEngine: true) captureTest( "a(b(c))", .tuple(.atom(), .atom()), - ("abc", tuple("bc", "c"))) + ("abc", ["bc", "c"])) captureTest( "a(b(c*))", .tuple(.atom(), .atom()), - ("ab", tuple("b", "")), - ("abc", tuple("bc", "c")), - ("abcc", tuple("bcc", "cc"))) + ("ab", ["b", ""]), + ("abc", ["bc", "c"]), + ("abcc", ["bcc", "cc"])) captureTest( "a(b(c)*)", .tuple(.atom(), .optional(.atom())), - ("ab", tuple("b", none)), - ("abc", tuple("bc", some("c"))), - ("abcc", tuple("bcc", some("c")))) + ("ab", ["b", .none]), + ("abc", ["bc", .some("c")]), + ("abcc", ["bcc", .some("c")])) captureTest( "a(b(c)?)", .tuple(.atom(), .optional(.atom())), - ("ab", tuple("b", none)), - ("abc", tuple("bc", some("c")))) + ("ab", ["b", .none]), + ("abc", ["bc", .some("c")])) captureTest( "a(b(c))*", .tuple(.optional(.atom()), .optional(.atom())), - ("a", tuple(none, none)), - ("abc", tuple(some("bc"), some("c"))), - ("abcbc", tuple(some("bc"), some("c")))) + ("a", [.none, .none]), + ("abc", [.some("bc"), .some("c")]), + ("abcbc", [.some("bc"), .some("c")])) captureTest( "a(b(c))?", .tuple(.optional(.atom()), .optional(.atom())), - ("a", tuple(none, none)), - ("abc", tuple(some("bc"), some("c")))) + ("a", [.none, .none]), + ("abc", [.some("bc"), .some("c")])) // TODO: "((a|b)*|c)*" // TODO: "((a|b)|c)*" + } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4872e256e..72c35b3f9 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -88,7 +88,11 @@ func parseTest( guard let decodedCaptures = CaptureStructure( decoding: UnsafeRawBufferPointer(serializedCaptures) ) else { - XCTFail("Malformed capture structure serialization") + XCTFail(""" + Malformed 
capture structure serialization + Captures: \(captures) + Serialization: \(Array(serializedCaptures)) + """) return } guard decodedCaptures == captures else { diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 121ff2624..2a0d471e9 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -12,6 +12,12 @@ import XCTest @testable import _StringProcessing +func dynCap( + _ s: String, optional: Bool = false +) -> DynamicCapture { + DynamicCapture(s[...], numOptionals: optional ? 1 : 0) +} + class RegexDSLTests: XCTestCase { func _testDSLCaptures( _ tests: (input: String, expectedCaptures: CaptureType?)..., @@ -466,9 +472,10 @@ class RegexDSLTests: XCTestCase { let regex = try Regex("aabcc.") let line = "aabccd" let captures = try XCTUnwrap(line.match(regex)?.1) - XCTAssertEqual(captures, .empty) + XCTAssertEqual(captures, []) } do { + let regex = try Regex( #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#) let line = """ @@ -478,10 +485,11 @@ class RegexDSLTests: XCTestCase { let captures = try XCTUnwrap(line.match(regex)?.1) XCTAssertEqual( captures, - .tuple([ - .substring("A6F0"), - .optional(.substring("A6F1")), - .substring("Extend")])) + [ + dynCap("A6F0"), + dynCap("A6F1", optional: true), + dynCap("Extend"), + ]) } } } From 4f0f195092e6b64e7cc934334fee4341ecc58f35 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 23 Feb 2022 09:13:59 -0700 Subject: [PATCH 2/3] wip: more on structured capture storage --- Sources/_StringProcessing/Capture.swift | 14 ++++++------ .../Engine/Structuralize.swift | 14 ++++++------ .../RegexDSL/DynamicCaptures.swift | 21 +++++++++--------- .../_StringProcessing/RegexDSL/Match.swift | 11 ++++++---- Tests/RegexTests/CaptureTests.swift | 22 +++++++++---------- Tests/RegexTests/RegexDSLTests.swift | 15 +++++++++++-- 6 files changed, 56 insertions(+), 41 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift 
b/Sources/_StringProcessing/Capture.swift index c9b75cdfd..fad842f9b 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -14,12 +14,12 @@ import _MatchingEngine /// A structured capture struct StructuredCapture { /// The `.optional` height of the result - var numOptionals = 0 + var optionalCount = 0 var storedCapture: StoredCapture? - var numSomes: Int { - storedCapture == nil ? numOptionals - 1 : numOptionals + var someCount: Int { + storedCapture == nil ? optionalCount - 1 : optionalCount } } @@ -33,7 +33,7 @@ struct StoredCapture { } extension StructuredCapture { - func extractExistentialMatchComponent( + func existentialMatchComponent( from input: Substring ) -> Any { var underlying: Any @@ -43,7 +43,7 @@ extension StructuredCapture { // Ok since we Any-box every step up the ladder underlying = Optional(nil) as Any } - for _ in 0.. Any { var caps = Array() caps.append(input) caps.append(contentsOf: self.map { - $0.extractExistentialMatchComponent(from: input) + $0.existentialMatchComponent(from: input) }) return TypeConstruction.tuple(of: caps) } diff --git a/Sources/_StringProcessing/Engine/Structuralize.swift b/Sources/_StringProcessing/Engine/Structuralize.swift index 7410f69fb..39fb25213 100644 --- a/Sources/_StringProcessing/Engine/Structuralize.swift +++ b/Sources/_StringProcessing/Engine/Structuralize.swift @@ -1,11 +1,11 @@ import _MatchingEngine extension CaptureStructure { - var numOptionals: Int { + var optionalCount: Int { switch self { case .atom: return 0 case .optional(let o): - return 1 + o.numOptionals + return 1 + o.optionalCount case .tuple: // FIXME: Separate CaptureStructure and a component fatalError("Recursive nesting") @@ -27,7 +27,7 @@ extension CaptureStructure { ) -> StructuredCapture { // TODO: CaptureList perhaps should store a // metatype or relevant info... 
- let numOptionals = cap.numOptionals + let optCount = cap.optionalCount if cap.atomType.base == Substring.self { // FIXME: What if a typed capture is Substring? @@ -35,12 +35,12 @@ extension CaptureStructure { if let r = storedCap.latest { return StructuredCapture( - numOptionals: numOptionals, + optionalCount: optCount, storedCapture: StoredCapture(range: r)) } return StructuredCapture( - numOptionals: numOptionals, + optionalCount: optCount, storedCapture: nil) } @@ -53,11 +53,11 @@ extension CaptureStructure { if let v = storedCap.latestValue { return StructuredCapture( - numOptionals: numOptionals, + optionalCount: optCount, storedCapture: StoredCapture(range: storedCap.latest, value: v)) } return StructuredCapture( - numOptionals: numOptionals, + optionalCount: optCount, storedCapture: nil) } diff --git a/Sources/_StringProcessing/RegexDSL/DynamicCaptures.swift b/Sources/_StringProcessing/RegexDSL/DynamicCaptures.swift index fbe180264..f498fdeb2 100644 --- a/Sources/_StringProcessing/RegexDSL/DynamicCaptures.swift +++ b/Sources/_StringProcessing/RegexDSL/DynamicCaptures.swift @@ -17,24 +17,29 @@ extension Regex where Match == (Substring, DynamicCaptures) { } } -public struct DynamicCapture: Hashable { - var numOptionals = 0 +// TODO: Empty token type rather than also having storage +public struct DynamicCaptures { + var contents: [StoredDynamicCapture] +} + +struct StoredDynamicCapture: Hashable { + var optionalCount = 0 // TODO: replace with a range var slice: Substring? 
- init(_ slice: Substring?, numOptionals: Int) { + init(_ slice: Substring?, optionalCount: Int) { self.slice = slice - self.numOptionals = numOptionals + self.optionalCount = optionalCount } } -extension DynamicCapture { +extension StoredDynamicCapture { init( _ cap: StructuredCapture, in input: String ) { - self.numOptionals = cap.numOptionals + self.optionalCount = cap.optionalCount guard let stored = cap.storedCapture else { self.slice = nil return @@ -46,7 +51,3 @@ extension DynamicCapture { self.slice = input[r] } } - -// TODO: Probably worth a separate type -public typealias DynamicCaptures = Array - diff --git a/Sources/_StringProcessing/RegexDSL/Match.swift b/Sources/_StringProcessing/RegexDSL/Match.swift index 5a8ce37ed..c3856cfbf 100644 --- a/Sources/_StringProcessing/RegexDSL/Match.swift +++ b/Sources/_StringProcessing/RegexDSL/Match.swift @@ -12,6 +12,8 @@ @dynamicMemberLookup public struct RegexMatch { public let range: Range + + // FIXME: Computed instead of stored public let match: Match public subscript(dynamicMember keyPath: KeyPath) -> T { @@ -50,15 +52,16 @@ extension RegexProtocol { } let convertedMatch: Match if Match.self == (Substring, DynamicCaptures).self { - let dynCaps = captures.map { - DynamicCapture($0, in: input) - } + let dynCaps = DynamicCaptures(contents: captures.map { + StoredDynamicCapture($0, in: input) + }) convertedMatch = (input[range], dynCaps) as! Match } else if Match.self == Substring.self { convertedMatch = input[range] as! Match } else { - let typeErasedMatch = captures.extractExistentialMatch( + // FIXME: Defer construction until accessed + let typeErasedMatch = captures.existentialMatch( from: input[range] ) convertedMatch = typeErasedMatch as! 
Match diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 88cbe59cf..9f3cc313b 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -5,13 +5,13 @@ import _MatchingEngine extension StructuredCapture { func formatStringCapture(input: String) -> String { - var res = String(repeating: "some(", count: numSomes) + var res = String(repeating: "some(", count: someCount) if let r = self.storedCapture?.range { res += input[r] } else { res += "none" } - res += String(repeating: ")", count: numSomes) + res += String(repeating: ")", count: someCount) return res } } @@ -29,37 +29,37 @@ extension Sequence where Element == StructuredCapture { struct StringCapture { var contents: String? - var numOptionals: Int + var optionalCount: Int - var numSomes: Int { - contents == nil ? numOptionals - 1 : numOptionals + var someCount: Int { + contents == nil ? optionalCount - 1 : optionalCount } static var none: Self { - self.init(contents: nil, numOptionals: 1) + self.init(contents: nil, optionalCount: 1) } static func some(_ s: Self) -> Self { self.init( - contents: s.contents, numOptionals: s.numOptionals+1) + contents: s.contents, optionalCount: s.optionalCount+1) } } extension StringCapture: ExpressibleByStringLiteral { init(stringLiteral: String) { self.contents = stringLiteral - self.numOptionals = 0 + self.optionalCount = 0 } } extension StringCapture: CustomStringConvertible { var description: String { - var res = String(repeating: "some(", count: numSomes) + var res = String(repeating: "some(", count: someCount) if let s = self.contents { res += s } else { res += "none" } - res += String(repeating: ")", count: numSomes) + res += String(repeating: ")", count: someCount) return res } } @@ -69,7 +69,7 @@ extension StringCapture { to structCap: StructuredCapture, in input: String ) -> Bool { - guard numOptionals == structCap.numOptionals else { + guard optionalCount == structCap.optionalCount else { 
return false } guard let r = structCap.storedCapture?.range else { diff --git a/Tests/RegexTests/RegexDSLTests.swift b/Tests/RegexTests/RegexDSLTests.swift index 2a0d471e9..b678f83e7 100644 --- a/Tests/RegexTests/RegexDSLTests.swift +++ b/Tests/RegexTests/RegexDSLTests.swift @@ -14,8 +14,19 @@ import XCTest func dynCap( _ s: String, optional: Bool = false -) -> DynamicCapture { - DynamicCapture(s[...], numOptionals: optional ? 1 : 0) +) -> StoredDynamicCapture { + StoredDynamicCapture(s[...], optionalCount: optional ? 1 : 0) +} + +extension DynamicCaptures: ExpressibleByArrayLiteral { + public init(arrayLiteral elements: StoredDynamicCapture...) { + self.init(contents: elements) + } +} +extension DynamicCaptures: Equatable { + public static func == (lhs: DynamicCaptures, rhs: DynamicCaptures) -> Bool { + lhs.contents == rhs.contents + } } class RegexDSLTests: XCTestCase { From 2b452032c53da1004d735bd56b05a50a883a5574 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 24 Feb 2022 10:22:08 -0700 Subject: [PATCH 3/3] mock up API for dynamic captures --- Sources/_StringProcessing/Capture.swift | 43 ++++++++++++++----- .../RegexDSL/DynamicCaptures.swift | 41 +++++++++++++++--- .../_StringProcessing/RegexDSL/Match.swift | 2 +- Tests/RegexTests/RegexDSLTests.swift | 11 ----- 4 files changed, 69 insertions(+), 28 deletions(-) diff --git a/Sources/_StringProcessing/Capture.swift b/Sources/_StringProcessing/Capture.swift index fad842f9b..7706cc49f 100644 --- a/Sources/_StringProcessing/Capture.swift +++ b/Sources/_StringProcessing/Capture.swift @@ -32,21 +32,42 @@ struct StoredCapture { var value: Any? = nil } +// TODO: Where should this live? Inside TypeConstruction? 
+func constructExistentialMatchComponent( + from input: Substring, + in range: Range?, + value: Any?, + optionalCount: Int +) -> Any { + let someCount: Int + var underlying: Any + if let v = value { + underlying = v + someCount = optionalCount + } else if let r = range { + underlying = input[r] + someCount = optionalCount + } else { + // Ok since we Any-box every step up the ladder + underlying = Optional(nil) as Any + someCount = optionalCount - 1 + } + + for _ in 0.. Any { - var underlying: Any - if let cap = self.storedCapture { - underlying = cap.value ?? input[cap.range!] - } else { - // Ok since we Any-box every step up the ladder - underlying = Optional(nil) as Any - } - for _ in 0.. -struct StoredDynamicCapture: Hashable { +// FIXME: Make this internal when we have API types or otherwise +// disentangle storage from API. In the meantime, this will have +// the storage name _and_ provide the API. +public struct StoredDynamicCapture: Hashable { var optionalCount = 0 // TODO: replace with a range @@ -34,6 +36,35 @@ struct StoredDynamicCapture: Hashable { } } +extension StoredDynamicCapture { + // TODO: How should we expose optional nesting? + + public var range: Range? { + guard let s = slice else { + return nil + } + return s.startIndex.. Bool { - lhs.contents == rhs.contents - } -} - class RegexDSLTests: XCTestCase { func _testDSLCaptures( _ tests: (input: String, expectedCaptures: CaptureType?)...,