From 55976c7713f8eb308ad5025743edf102120c190a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sun, 25 Feb 2024 13:36:45 -0600 Subject: [PATCH 1/3] Allow captures during lookahead This fixes an issue where capture groups inside a positive lookahead were being reset even upon successful matching of the lookahead. For example, with the pattern `/(?=(\d))/`, matching against a string like `"abc1"` should result in the output `("", "1")`. However, accessing the output traps instead, since the range data for capture 1 is missing even on success. This change resolves the issue by adding a boolean payload to the `fail` instruction that indicates whether to preserve captures when resetting the matching state, which allows any captures inside a lookahead to persist after success. Fixes #713. --- Sources/_StringProcessing/ByteCodeGen.swift | 68 +++++++++++++------ .../Engine/InstPayload.swift | 8 +-- .../_StringProcessing/Engine/MEBuilder.swift | 4 +- .../_StringProcessing/Engine/MECapture.swift | 2 +- .../_StringProcessing/Engine/Processor.swift | 17 ++++- Tests/RegexBuilderTests/AlgorithmsTests.swift | 22 ++++++ Tests/RegexTests/MatchTests.swift | 21 +++++- 7 files changed, 108 insertions(+), 34 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3cf6bd6f2..c9045da70 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -308,31 +308,48 @@ fileprivate extension Compiler.ByteCodeGen { try emitNode(node) } - mutating func emitLookaround( - _ kind: (forwards: Bool, positive: Bool), - _ child: DSLTree.Node - ) throws { - guard kind.forwards else { - throw Unsupported("backwards assertions") - } - - let positive = kind.positive + mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws { /* save(restoringAt: success) save(restoringAt: intercept) // failure restores at intercept clearThrough(intercept) // remove intercept and any leftovers from - : - clearSavePoint // remove success - fail // positive->success, negative propagates + fail // ->success intercept: - : - clearSavePoint // remove success - fail // positive propagates, negative->success + clearSavePoint // remove success + fail // propagate failure success: ... */ + let intercept = builder.makeAddress() + let success = builder.makeAddress() + builder.buildSave(success) + builder.buildSave(intercept) + try emitNode(child) + builder.buildClearThrough(intercept) + builder.buildFail(preservingCaptures: true) // Lookahead succeeds here + + builder.label(intercept) + builder.buildClear() + builder.buildFail() + + builder.label(success) + } + + mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws { + /* + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from + clearSavePoint // remove success + fail // propagate failure + intercept: + fail // ->success + success: + ... + */ let intercept = builder.makeAddress() let success = builder.makeAddress() @@ -340,19 +357,28 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildSave(intercept) try emitNode(child) builder.buildClearThrough(intercept) - if !positive { - builder.buildClear() - } + builder.buildClear() builder.buildFail() builder.label(intercept) - if positive { - builder.buildClear() - } builder.buildFail() builder.label(success) } + + mutating func emitLookaround( + _ kind: (forwards: Bool, positive: Bool), + _ child: DSLTree.Node + ) throws { + guard kind.forwards else { + throw Unsupported("backwards assertions") + } + if kind.positive { + try emitPositiveLookahead(child) + } else { + try emitNegativeLookahead(child) + } + } mutating func emitAtomicNoncapturingGroup( _ child: DSLTree.Node diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 5f038449c..452ed8bc3 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -211,11 +211,11 @@ extension Instruction.Payload { self.rawValue == 1 } - init(bool: BoolRegister) { - self.init(bool) + init(bool: Bool) { + self.init(bool ? 1 : 0, 0) } - var bool: BoolRegister { - interpret() + var boolPayload: Bool { + interpret(as: TypedInt.self) == 1 } init(element: ElementRegister, isCaseInsensitive: Bool) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 1f5611f98..844ead08e 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -146,8 +146,8 @@ extension MEProgram.Builder { instructions.append(.init(.clearThrough)) fixup(to: t) } - mutating func buildFail() { - instructions.append(.init(.fail)) + mutating func buildFail(preservingCaptures: Bool = false) { + instructions.append(.init(.fail, .init(bool: preservingCaptures))) } mutating func buildAdvance(_ n: Distance) { diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 9bb4ecb06..1964c9b8e 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -37,7 +37,7 @@ extension Processor { var value: Any? = nil // An in-progress capture start - fileprivate var currentCaptureBegin: Position? = nil + var currentCaptureBegin: Position? = nil fileprivate func _invariantCheck() { if range == nil { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 0de7c2f70..2770e130d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -331,7 +331,7 @@ extension Processor { return true } - mutating func signalFailure() { + mutating func signalFailure(preservingCaptures: Bool = false) { guard !savePoints.isEmpty else { state = .fail return @@ -362,10 +362,20 @@ extension Processor { controller.pc = pc currentPosition = pos ?? currentPosition callStack.removeLast(callStack.count - stackEnd.rawValue) - storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters + if preservingCaptures { + for i in capEnds.indices { + if storedCaptures[i].range == nil { + storedCaptures[i].currentCaptureBegin = capEnds[i].currentCaptureBegin + } + } + } else { + // Reset all capture information + storedCaptures = capEnds + } + metrics.addBacktrack() } @@ -479,7 +489,8 @@ extension Processor { tryAccept() case .fail: - signalFailure() + let preservingCaptures = payload.boolPayload + signalFailure(preservingCaptures: preservingCaptures) case .advance: let (isScalar, distance) = payload.distance diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index b85395d19..d9d27e7a1 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -256,6 +256,28 @@ class AlgorithmsResultBuilderTests: XCTestCase { "+" int } + + let ref1 = Reference() + let ref2 = Reference() + try expectMatch( + .first, + ("ABBAB", ("ABBAB", "A", "B")), + ("defABBAdefB", ("defABBAdefB", "A", "B")), + matchType: (Substring, Substring, Substring).self, + equivalence: == + ) { + Anchor.startOfSubject + Lookahead { + ZeroOrMore(.any) + Capture(as: ref1) { One(.any) } + Capture(as: ref2) { One(.any) } + ref2 + ref1 + } + OneOrMore(.any) + ref2 + Anchor.endOfSubject + } } func testStartsAndContains() throws { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 0de71f532..5ca2441ab 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1823,12 +1823,10 @@ extension RegexTests { (input: "abbba", match: nil), (input: "ABBA", match: nil), (input: "defABBAdef", match: nil)) - // FIXME: Backreferences don't escape positive lookaheads firstMatchTests( #"^(?=.*(.)(.)\2\1).+\2$"#, (input: "ABBAB", match: "ABBAB"), - (input: "defABBAdefB", match: "defABBAdefB"), - xfail: true) + (input: "defABBAdefB", match: "defABBAdefB")) firstMatchTests( #"^(?!.*(.)(.)\2\1).+$"#, @@ -2771,6 +2769,23 @@ extension RegexTests { } } + func testIssue713() throws { + // Original report from https://github.com/apple/swift-experimental-string-processing/issues/713 + let originalInput = "Something 9a" + let originalRegex = #/(?=([1-9]|(a|b)))/# + let originalOutput = originalInput.matches(of: originalRegex).map(\.output) + XCTAssert(originalOutput[0] == ("", "9", nil)) + XCTAssert(originalOutput[1] == ("", "a", "a")) + + let simplifiedRegex = #/(?=(9))/# + let simplifiedOutput = originalInput.matches(of: simplifiedRegex).map(\.output) + XCTAssert(simplifiedOutput[0] == ("", "9")) + + let additionalRegex = #/(a+)b(a+)/# + let additionalInput = "abaaba" + XCTAssertNil(additionalInput.wholeMatch(of: additionalRegex)) + } + func testNSRECompatibility() throws { // NSRE-compatibility includes scalar matching, so `[\r\n]` should match // either `\r` or `\n`. From 8215139d1b40ad1f971fb4f767a7655bbd527aa0 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 2 Mar 2024 00:42:47 -0600 Subject: [PATCH 2/3] Fix the same issue for captures in atomic groups --- Sources/_StringProcessing/ByteCodeGen.swift | 2 +- Tests/RegexTests/MatchTests.swift | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index c9045da70..29a1af269 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -403,7 +403,7 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildSave(intercept) try emitNode(child) builder.buildClearThrough(intercept) - builder.buildFail() + builder.buildFail(preservingCaptures: true) // Atomic group succeeds here builder.label(intercept) builder.buildClear() diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 5ca2441ab..7788f9538 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1804,8 +1804,7 @@ extension RegexTests { firstMatchTests( #"(?>(\d+))\w+\1"#, (input: "23x23", match: "23x23"), - (input: "123x23", match: "23x23"), - xfail: true) + (input: "123x23", match: "23x23")) // Backreferences in scalar mode // In scalar mode the backreference should not match From 2f84e5645e512f4d0b7977f556bf15bb9194865c Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 8 Mar 2024 11:36:21 -0600 Subject: [PATCH 3/3] Address notes from @milseman --- Sources/_StringProcessing/ByteCodeGen.swift | 8 ++++---- Sources/_StringProcessing/Engine/Processor.swift | 8 +------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 29a1af269..00afca28b 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -313,8 +313,8 @@ fileprivate extension Compiler.ByteCodeGen { save(restoringAt: success) save(restoringAt: intercept) // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail // ->success + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success intercept: clearSavePoint // remove success fail // propagate failure @@ -387,8 +387,8 @@ fileprivate extension Compiler.ByteCodeGen { save(continuingAt: success) save(restoringAt: intercept) // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail // ->success + clearThrough(intercept) // remove intercept and any leftovers from + fail(preservingCaptures: true) // ->success intercept: clearSavePoint // remove success fail // propagate failure diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 2770e130d..48c79fc8d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -365,13 +365,7 @@ extension Processor { registers.ints = intRegisters registers.positions = posRegisters - if preservingCaptures { - for i in capEnds.indices { - if storedCaptures[i].range == nil { - storedCaptures[i].currentCaptureBegin = capEnds[i].currentCaptureBegin - } - } - } else { + if !preservingCaptures { // Reset all capture information storedCaptures = capEnds }