diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5fb8e89c..3820f504 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -43,9 +43,18 @@ extension Compiler { extension Compiler.ByteCodeGen { mutating func emitRoot(_ root: DSLTree.Node) throws -> MEProgram { - // The whole match (`.0` element of output) is equivalent to an implicit - // capture over the entire regex. - try emitNode(.capture(name: nil, reference: nil, root)) + // If the whole regex is a matcher, then the whole-match value + // is the constructed value. Denote that the current value + // register is the processor's value output. + switch root { + case .matcher: + builder.denoteCurrentValueIsWholeMatchValue() + default: + break + } + + try emitNode(root) + builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart() builder.buildAccept() return try builder.assemble() @@ -149,8 +158,9 @@ fileprivate extension Compiler.ByteCodeGen { guard let i = n.value else { throw Unreachable("Expected a value") } + let cap = builder.captureRegister(forBackreference: i) builder.buildBackreference( - .init(i), isScalarMode: options.semanticLevel == .unicodeScalar) + cap, isScalarMode: options.semanticLevel == .unicodeScalar) case .named(let name): try builder.buildNamedReference( name, isScalarMode: options.semanticLevel == .unicodeScalar) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index cd43dc76..c862cfae 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -33,10 +33,18 @@ extension MEProgram { // Registers var nextIntRegister = IntRegister(0) - var nextCaptureRegister = CaptureRegister(0) var nextValueRegister = ValueRegister(0) var nextPositionRegister = PositionRegister(0) + // Set to non-nil when a value register holds the whole-match + // value (i.e. 
when a regex consists entirely of a custom matcher) + var wholeMatchValue: ValueRegister? = nil + + // Note: Capture 0 (i.e. whole-match) is handled specially + // by the engine, so `n` here refers to the regex AST's `n+1` + // capture + var nextCaptureRegister = CaptureRegister(0) + // Special addresses or instructions var failAddressToken: AddressToken? = nil @@ -70,6 +78,24 @@ extension MEProgram.Builder { self.second = b } } + + // Maps the AST's named capture offset to a capture register + func captureRegister(named name: String) throws -> CaptureRegister { + guard let index = captureList.indexOfCapture(named: name) else { + throw RegexCompilationError.uncapturedReference + } + return .init(index - 1) + } + + // Map an AST's backreference number to a capture register + func captureRegister(forBackreference i: Int) -> CaptureRegister { + .init(i - 1) + } + + mutating func denoteCurrentValueIsWholeMatchValue() { + assert(wholeMatchValue == nil) + wholeMatchValue = nextValueRegister + } } extension MEProgram.Builder { @@ -337,10 +363,8 @@ extension MEProgram.Builder { } mutating func buildNamedReference(_ name: String, isScalarMode: Bool) throws { - guard let index = captureList.indexOfCapture(named: name) else { - throw RegexCompilationError.uncapturedReference - } - buildBackreference(.init(index), isScalarMode: isScalarMode) + let cap = try captureRegister(named: name) + buildBackreference(cap, isScalarMode: isScalarMode) } // TODO: Mutating because of fail address fixup, drop when @@ -401,6 +425,7 @@ extension MEProgram.Builder { regInfo.transformFunctions = transformFunctions.count regInfo.matcherFunctions = matcherFunctions.count regInfo.captures = nextCaptureRegister.rawValue + regInfo.wholeMatchValue = wholeMatchValue?.rawValue return MEProgram( instructions: InstructionList(instructions), @@ -514,8 +539,8 @@ extension MEProgram.Builder { assert(preexistingValue == nil) } if let name = name { - let index = captureList.indexOfCapture(named: name) - 
assert(index == nextCaptureRegister.rawValue) + let cap = try? captureRegister(named: name) + assert(cap == nextCaptureRegister) } assert(nextCaptureRegister.rawValue < captureList.captures.count) return nextCaptureRegister diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 3dfda6b9..e18365d6 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -84,17 +84,3 @@ extension Processor { } } } - -struct MECaptureList { - var values: Array<Processor._StoredCapture> - var referencedCaptureOffsets: [ReferenceID: Int] - - func latestUntyped(from input: String) -> Array<Substring?> { - values.map { - guard let range = $0.range else { - return nil - } - return input[range] - } - } -} diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index 43fb0b8d..eb600e97 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -172,6 +172,10 @@ extension MEProgram { var positionStackAddresses = 0 var savePointAddresses = 0 var captures = 0 + + // The value register holding the whole-match value, if there + // is one + var wholeMatchValue: Int? = nil } } diff --git a/Sources/_StringProcessing/Engine/Structuralize.swift b/Sources/_StringProcessing/Engine/Structuralize.swift index df109083..d91d0f1a 100644 --- a/Sources/_StringProcessing/Engine/Structuralize.swift +++ b/Sources/_StringProcessing/Engine/Structuralize.swift @@ -1,20 +1,36 @@ internal import _RegexParser -extension CaptureList { - @available(SwiftStdlib 5.7, *) - func createElements( - _ list: MECaptureList +@available(SwiftStdlib 5.7, *) +extension Executor { + static func createExistentialElements( + _ program: MEProgram, + matchRange: Range<String.Index>, + storedCaptures: [Processor._StoredCapture], + wholeMatchValue: Any? 
) -> [AnyRegexOutput.ElementRepresentation] { - assert(list.values.count == captures.count) - + let capList = program.captureList + let capOffsets = program.referencedCaptureOffsets + + // Formal captures include the entire match + assert(storedCaptures.count + 1 == capList.captures.count) + var result = [AnyRegexOutput.ElementRepresentation]() - - for (i, (cap, meStored)) in zip(captures, list.values).enumerated() { + result.reserveCapacity(1 + capList.captures.count) + result.append( + AnyRegexOutput.ElementRepresentation( + optionalDepth: 0, + content: (matchRange, wholeMatchValue), + visibleInTypedOutput: capList.captures[0].visibleInTypedOutput) + ) + + for (i, (cap, meStored)) in zip( + capList.captures.dropFirst(), storedCaptures + ).enumerated() { let element = AnyRegexOutput.ElementRepresentation( optionalDepth: cap.optionalDepth, content: meStored.deconstructed, name: cap.name, - referenceID: list.referencedCaptureOffsets.first { $1 == i }?.key, + referenceID: capOffsets.first { $1 == i }?.key, visibleInTypedOutput: cap.visibleInTypedOutput ) diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 6befcdbc..46c03eb1 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -190,15 +190,22 @@ extension Executor { guard let endIdx = try cpu.run() else { return nil } - let capList = MECaptureList( - values: cpu.storedCaptures, - referencedCaptureOffsets: program.referencedCaptureOffsets) - let range = startPosition..