diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index da0888039..66fefc49e 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -471,6 +471,10 @@ fileprivate extension Compiler.ByteCodeGen {
     let minTrips = low
     assert((extraTrips ?? 1) >= 0)
 
+    if tryEmitFastQuant(child, updatedKind, minTrips, extraTrips) {
+      return
+    }
+
     // The below is a general algorithm for bounded and unbounded
     // quantification. It can be specialized when the min
     // is 0 or 1, or when extra trips is 1 or unbounded.
@@ -655,6 +659,84 @@ fileprivate extension Compiler.ByteCodeGen {
     builder.label(exit)
   }
 
+  /// Specialized quantification instruction for repetition of certain nodes in grapheme semantic mode
+  /// Allowed nodes are:
+  /// - single ascii scalar .char (not a cased character under case-insensitive matching)
+  /// - ascii .customCharacterClass
+  /// - single grapheme consuming built in character classes
+  /// - .any, .anyNonNewline, .dot
+  ///
+  /// - Returns: `true` if a `.quantify` instruction was emitted, `false` if the caller has to use the general algorithm
+  mutating func tryEmitFastQuant(
+    _ child: DSLTree.Node,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) -> Bool {
+    guard optimizationsEnabled
+            && minTrips <= QuantifyPayload.maxStorableTrips
+            && extraTrips ?? 0 <= QuantifyPayload.maxStorableTrips
+            && options.semanticLevel == .graphemeCluster
+            && kind != .reluctant else {
+      return false
+    }
+    switch child {
+    case .customCharacterClass(let ccc):
+      // ascii only custom character class
+      guard let bitset = ccc.asAsciiBitset(options) else {
+        return false
+      }
+      builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips)
+
+    case .atom(let atom):
+      switch atom {
+      case .char(let c):
+        // Single scalar ascii value character. The emitted instruction compares against one
+        // exact scalar, so a cased character under case-insensitive matching is not eligible.
+        guard !(options.isCaseInsensitive && c.isCased),
+              let val = c._singleScalarAsciiValue else {
+          return false
+        }
+        builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips)
+
+      case .any:
+        builder.buildQuantifyAny(
+          matchesNewlines: true, kind, minTrips, extraTrips)
+      case .anyNonNewline:
+        builder.buildQuantifyAny(
+          matchesNewlines: false, kind, minTrips, extraTrips)
+      case .dot:
+        builder.buildQuantifyAny(
+          matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips)
+
+      case .characterClass(let cc):
+        // Custom character class that consumes a single grapheme
+        let model = cc.asRuntimeModel(options)
+        guard model.consumesSingleGrapheme else {
+          return false
+        }
+        builder.buildQuantify(
+          model: model,
+          kind,
+          minTrips,
+          extraTrips)
+      default:
+        return false
+      }
+    case .convertedRegexLiteral(let node, _):
+      return tryEmitFastQuant(node, kind, minTrips, extraTrips)
+    case .nonCapturingGroup(let groupKind, let node):
+      // .nonCapture nonCapturingGroups are ignored during compilation
+      guard groupKind.ast == .nonCapture else {
+        return false
+      }
+      return tryEmitFastQuant(node, kind, minTrips, extraTrips)
+    default:
+      return false
+    }
+    return true
+  }
+
   /// Coalesce any adjacent scalar members in a custom character class together.
   /// This is required in order to produce correct grapheme matching behavior.
func coalescingCustomCharacterClassMembers( diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 355702ac1..3ebb060c9 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -10,14 +10,12 @@ //===----------------------------------------------------------------------===// extension Processor { - - // TODO: What all do we want to save? Configurable? - // TODO: Do we need to save any registers? - // TODO: Is this the right place to do function stack unwinding? struct SavePoint { var pc: InstructionAddress var pos: Position? - + // Quantifiers may store a range of positions to restore to + var rangeStart: Position? + var rangeEnd: Position? // The end of the call stack, so we can slice it off // when failing inside a call. // @@ -43,7 +41,35 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) { - (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) + return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) + } + + var rangeIsEmpty: Bool { rangeEnd == nil } + + mutating func updateRange(newEnd: Input.Index) { + if rangeStart == nil { + rangeStart = newEnd + } + rangeEnd = newEnd + } + + /// Move the next range position into pos, and removing it from the range + mutating func takePositionFromRange(_ input: Input) { + assert(!rangeIsEmpty) + pos = rangeEnd! + shrinkRange(input) + } + + /// Shrink the range of the save point by one index, essentially dropping the last index + mutating func shrinkRange(_ input: Input) { + assert(!rangeIsEmpty) + if rangeEnd == rangeStart { + // The range is now empty + rangeStart = nil + rangeEnd = nil + } else { + input.formIndex(before: &rangeEnd!) + } } } @@ -54,6 +80,21 @@ extension Processor { SavePoint( pc: pc, pos: addressOnly ? 
nil : currentPosition, + rangeStart: nil, + rangeEnd: nil, + stackEnd: .init(callStack.count), + captureEnds: storedCaptures, + intRegisters: registers.ints, + posRegisters: registers.positions) + } + + func startQuantifierSavePoint() -> SavePoint { + // Restores to the instruction AFTER the current quantifier instruction + SavePoint( + pc: controller.pc + 1, + pos: nil, + rangeStart: nil, + rangeEnd: nil, stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index d6372c0ba..1e2ed757b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -9,6 +9,8 @@ // //===----------------------------------------------------------------------===// +@_implementationOnly import _RegexParser + extension Instruction { /// An instruction's payload packs operands and destination /// registers. 
@@ -330,7 +332,9 @@ extension Instruction.Payload {
   ) {
     interpretPair()
   }
+
   // MARK: Struct payloads
+
   init(_ model: _CharacterClassModel) {
     self.init(CharacterClassPayload(model).rawValue)
   }
@@ -342,11 +346,169 @@ extension Instruction.Payload {
     self.init(rawValue: payload.rawValue)
   }
   var assertion: AssertionPayload {
-    AssertionPayload.init(rawValue: self.rawValue & _payloadMask)
+    AssertionPayload.init(rawValue: rawValue & _payloadMask)
+  }
+  init(quantify: QuantifyPayload) {
+    self.init(quantify.rawValue)
+  }
+  var quantify: QuantifyPayload {
+    return QuantifyPayload(rawValue: rawValue & _payloadMask)
   }
 }
 
 // MARK: Struct definitions
+struct QuantifyPayload: RawRepresentable {
+  let rawValue: UInt64
+  enum PayloadType: UInt64 {
+    case bitset = 0
+    case asciiChar = 1
+    case any = 2
+    case builtin = 4
+  }
+
+  // Future work: optimize this layout -> payload type should be a fast switch
+  // The top 8 bits are reserved for the opcode so we have 56 bits to work with
+  // b55-b38 - Unused
+  // b38-b35 - Payload type (one of 4 types, stored on 3 bits)
+  // b35-b27 - minTrips (8 bit int)
+  // b27-b18 - extraTrips (9 bits: 8 bit value shifted left by one; the value 1 encodes nil)
+  // b18-b16 - Quantification type (one of three types)
+  // b16-b0 - Payload value (depends on payload type)
+  static var quantKindShift: UInt64 { 16 }
+  static var extraTripsShift: UInt64 { 18 }
+  static var minTripsShift: UInt64 { 27 }
+  static var typeShift: UInt64 { 35 }
+  static var maxStorableTrips: UInt64 { (1 << 8) - 1 }
+
+  var quantKindMask: UInt64 { 3 }
+  var extraTripsMask: UInt64 { 0x1FF }
+  var minTripsMask: UInt64 { 0xFF }
+  var typeMask: UInt64 { 7 }
+  var payloadMask: UInt64 { 0xFF_FF }
+
+  static func packInfoValues(
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?,
+    _ type: PayloadType
+  ) -> UInt64 {
+    let kindVal: UInt64
+    switch kind {
+    case .eager:
+      kindVal = 0
+    case .reluctant:
+      kindVal = 1
+    case .possessive:
+      kindVal = 2
+    }
+    let extraTripsVal: UInt64 = extraTrips == nil ? 1 : UInt64(extraTrips!) << 1
+    return (kindVal << QuantifyPayload.quantKindShift) +
+      (extraTripsVal << QuantifyPayload.extraTripsShift) +
+      (UInt64(minTrips) << QuantifyPayload.minTripsShift) +
+      (type.rawValue << QuantifyPayload.typeShift)
+  }
+
+  init(rawValue: UInt64) {
+    self.rawValue = rawValue
+    assert(rawValue & _opcodeMask == 0)
+  }
+
+  init(
+    bitset: AsciiBitsetRegister,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    assert(bitset.bits <= 0xFF_FF) // must fit the 16 bit payload value field (see payloadMask)
+    self.rawValue = bitset.bits
+      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset)
+  }
+
+  init(
+    asciiChar: UInt8,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    self.rawValue = UInt64(asciiChar)
+      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar)
+  }
+
+  init(
+    matchesNewlines: Bool,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    self.rawValue = (matchesNewlines ? 1 : 0)
+      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any)
+  }
+
+  init(
+    model: _CharacterClassModel,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    assert(model.cc.rawValue < 0xFF)
+    assert(model.matchLevel != .unicodeScalar)
+    let packedModel = model.cc.rawValue
+      + (model.isInverted ? 1 << 9 : 0)
+      + (model.isStrictASCII ? 1 << 10 : 0)
+    self.rawValue = packedModel
+      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin)
+  }
+
+  var type: PayloadType {
+    PayloadType(rawValue: (self.rawValue >> QuantifyPayload.typeShift) & 7)!
+  }
+
+  var quantKind: AST.Quantification.Kind {
+    switch (self.rawValue >> QuantifyPayload.quantKindShift) & quantKindMask {
+    case 0: return .eager
+    case 1: return .reluctant
+    case 2: return .possessive
+    default:
+      fatalError("Unreachable")
+    }
+  }
+
+  var minTrips: UInt64 {
+    (self.rawValue >> QuantifyPayload.minTripsShift) & minTripsMask
+  }
+
+  var extraTrips: UInt64? {
+    let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & extraTripsMask
+    if val == 1 {
+      return nil
+    } else {
+      return val >> 1
+    }
+  }
+
+  var bitset: AsciiBitsetRegister {
+    TypedInt(self.rawValue & payloadMask)
+  }
+
+  var asciiChar: UInt8 {
+    UInt8(asserting: self.rawValue & payloadMask)
+  }
+
+  var anyMatchesNewline: Bool {
+    (self.rawValue & 1) == 1
+  }
+
+  var builtin: _CharacterClassModel.Representation {
+    _CharacterClassModel.Representation(rawValue: self.rawValue & 0xFF)!
+  }
+  var builtinIsInverted: Bool {
+    (self.rawValue >> 9) & 1 == 1
+  }
+  var builtinIsStrict: Bool {
+    (self.rawValue >> 10) & 1 == 1
+  }
+}
+
 struct CharacterClassPayload: RawRepresentable {
   let rawValue: UInt64
   // Layout:
diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift
index f2ee88636..a41d2f4af 100644
--- a/Sources/_StringProcessing/Engine/Instruction.swift
+++ b/Sources/_StringProcessing/Engine/Instruction.swift
@@ -193,6 +193,13 @@ extension Instruction {
     ///
     case splitSaving
 
+    /// Fused quantify, execute, save instruction
+    /// Quantifies the stored instruction in an inner loop instead of looping through instructions in processor
+    /// Only quantifies specific nodes
+    ///
+    ///     quantify(_:QuantifyPayload)
+    ///
+    case quantify
     /// Begin the given capture
     ///
     ///     beginCapture(_:CapReg)
diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift
index 3406e9fed..959b1507e 100644
--- a/Sources/_StringProcessing/Engine/MEBuilder.swift
+++ b/Sources/_StringProcessing/Engine/MEBuilder.swift
@@ -201,6 +201,50 @@ extension MEProgram.Builder {
       .init(assertion: payload)))
   }
 
+  mutating func buildQuantify(
+    bitset: DSLTree.CustomCharacterClass.AsciiBitset,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    instructions.append(.init(
+      .quantify,
+      .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, extraTrips))))
+  }
+
+  mutating func buildQuantify(
+    asciiChar: UInt8,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    instructions.append(.init(
+      .quantify,
+      .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, extraTrips))))
+  }
+
+  mutating func buildQuantifyAny(
+    matchesNewlines: Bool,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    instructions.append(.init(
+      .quantify,
+      .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, extraTrips))))
+  }
+
+  mutating func buildQuantify(
+    model: _CharacterClassModel,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) {
+    instructions.append(.init(
+      .quantify,
+      .init(quantify: .init(model: model,kind, minTrips, extraTrips))))
+  }
+
   mutating func buildAccept() {
     instructions.append(.init(.accept))
   }
diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index d05348893..36a6043fe 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -1,5 +1,4 @@
 @_implementationOnly import _RegexParser // For AssertionKind
-
 extension Character {
   var _isHorizontalWhitespace: Bool {
     self.unicodeScalars.first?.isHorizontalWhitespace == true
@@ -16,10 +15,28 @@ extension Processor {
     _ isStrictASCII: Bool,
     _ isScalarSemantics: Bool
   ) -> Bool {
-    guard let char = load(), let scalar = loadScalar() else {
+    guard let next = _doMatchBuiltin(
+      cc,
+      isInverted,
+      isStrictASCII,
+      isScalarSemantics
+    ) else {
       signalFailure()
       return false
     }
+    currentPosition = next
+    return true
+  }
+
+  func _doMatchBuiltin(
+    _ cc: _CharacterClassModel.Representation,
+    _ isInverted: Bool,
+    _ isStrictASCII: Bool,
+    _ isScalarSemantics: Bool
+  ) -> Input.Index? {
+    guard let char = load(), let scalar = loadScalar() else {
+      return nil
+    }
 
     let asciiCheck = (char.isASCII && !isScalarSemantics)
       || (scalar.isASCII && isScalarSemantics)
@@ -95,12 +112,9 @@ extension Processor {
     }
 
     guard matched else {
-      signalFailure()
-      return false
+      return nil
     }
-
-    currentPosition = next
-    return true
+    return next
   }
 
   func isAtStartOfLine(_ payload: AssertionPayload) -> Bool {
@@ -185,6 +199,6 @@ extension Processor {
       } else {
         return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
       }
-    }
+    }
   }
 }
diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift
index 53243cd34..4bea21133 100644
--- a/Sources/_StringProcessing/Engine/MECapture.swift
+++ b/Sources/_StringProcessing/Engine/MECapture.swift
@@ -88,12 +88,6 @@ extension Processor {
   }
 }
 
-extension Processor._StoredCapture: CustomStringConvertible {
-  var description: String {
-    return String(describing: self)
-  }
-}
-
 struct MECaptureList {
   var values: Array
   var referencedCaptureOffsets: [ReferenceID: Int]
diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
new file mode 100644
index 000000000..9d17dc9bd
--- /dev/null
+++ b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -0,0 +1,128 @@
+extension Processor {
+  /// Tries to match the quantified element once at the current position.
+  /// Returns the position after the match, or nil if there is no match; currentPosition is not changed.
+  func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? {
+    var next: Input.Index?
+    switch payload.type {
+    case .bitset:
+      next = _doMatchBitset(registers[payload.bitset])
+    case .asciiChar:
+      next = _doMatchScalar(
+        UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true)
+    case .builtin:
+      // We only emit .quantify if it consumes a single character
+      next = _doMatchBuiltin(
+        payload.builtin,
+        payload.builtinIsInverted,
+        payload.builtinIsStrict,
+        false)
+    case .any:
+      // Stop at the processor's `end` bound, as the scalar matcher does, not at the end of the whole input
+      let matched = currentPosition < end
+        && (!input[currentPosition].isNewline || payload.anyMatchesNewline)
+      next = matched ? input.index(after: currentPosition) : nil
+    }
+    return next
+  }
+
+  /// Generic quantify instruction interpreter
+  /// - Handles .eager and .possessive
+  /// - Handles arbitrary minTrips and extraTrips
+  mutating func runQuantify(_ payload: QuantifyPayload) -> Bool {
+    var trips = 0
+    var extraTrips = payload.extraTrips
+    var savePoint = startQuantifierSavePoint()
+
+    while true {
+      if trips >= payload.minTrips {
+        if extraTrips == 0 { break }
+        extraTrips = extraTrips.map({$0 - 1})
+        if payload.quantKind == .eager {
+          savePoint.updateRange(newEnd: currentPosition)
+        }
+      }
+      let next = _doQuantifyMatch(payload)
+      guard let idx = next else { break }
+      currentPosition = idx
+      trips += 1
+    }
+
+    if trips < payload.minTrips {
+      signalFailure()
+      return false
+    }
+
+    if payload.quantKind == .eager && !savePoint.rangeIsEmpty {
+      // Drop the last saved position only if it is the current one (the loop ended on a failed match, not on the trip limit)
+      if savePoint.rangeEnd == currentPosition { savePoint.shrinkRange(input) }
+      if !savePoint.rangeIsEmpty {
+        savePoints.append(savePoint)
+      }
+    }
+    return true
+  }
+
+  /// Specialized quantify instruction interpreter for *
+  mutating func runEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) -> Bool {
+    assert(payload.quantKind == .eager
+           && payload.minTrips == 0
+           && payload.extraTrips == nil)
+    var savePoint = startQuantifierSavePoint()
+
+    while true {
+      savePoint.updateRange(newEnd: currentPosition)
+      let next = _doQuantifyMatch(payload)
+      guard let idx = next else { break }
+      currentPosition = idx
+    }
+
+    // The last save point has saved the current position, so it's unneeded
+    savePoint.shrinkRange(input)
+    if !savePoint.rangeIsEmpty {
+      savePoints.append(savePoint)
+    }
+    return true
+  }
+
+  /// Specialized quantify instruction interpreter for +
+  mutating func runEagerOneOrMoreQuantify(_ payload: QuantifyPayload) -> Bool {
+    assert(payload.quantKind == .eager
+           && payload.minTrips == 1
+           && payload.extraTrips == nil)
+    var savePoint = startQuantifierSavePoint()
+    while true {
+      let next = _doQuantifyMatch(payload)
+      guard let idx = next else { break }
+      currentPosition = idx
+      savePoint.updateRange(newEnd: currentPosition)
+    }
+
+    if savePoint.rangeIsEmpty {
+      signalFailure()
+      return false
+    }
+    // The last save point has saved the current position, so it's unneeded
+    savePoint.shrinkRange(input)
+    if !savePoint.rangeIsEmpty {
+      savePoints.append(savePoint)
+    }
+    return true
+  }
+
+  /// Specialized quantify instruction interpreter for ?
+  mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) -> Bool {
+    assert(payload.minTrips == 0
+           && payload.extraTrips == 1)
+    let next = _doQuantifyMatch(payload)
+    guard let idx = next else {
+      return true // matched zero times
+    }
+    if payload.quantKind != .possessive {
+      // Save the zero match
+      let savePoint = makeSavePoint(currentPC + 1)
+      savePoints.append(savePoint)
+    }
+    currentPosition = idx
+    return true
+  }
+}
diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift
index 55ac49ed9..a62c1e070 100644
--- a/Sources/_StringProcessing/Engine/Processor.swift
+++ b/Sources/_StringProcessing/Engine/Processor.swift
@@ -244,18 +244,25 @@ extension Processor {
     currentPosition < end ? input.unicodeScalars[currentPosition] : nil
   }
 
+  func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? {
+    if s == loadScalar(),
+       let idx = input.unicodeScalars.index(
+        currentPosition,
+        offsetBy: 1,
+        limitedBy: end),
+       (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) {
+      return idx
+    } else {
+      return nil
+    }
+  }
+
   mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool {
-    guard s == loadScalar(),
-          let idx = input.unicodeScalars.index(
-            currentPosition,
-            offsetBy: 1,
-            limitedBy: end),
-          (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx))
-    else {
+    guard let next = _doMatchScalar(s, boundaryCheck) else {
       signalFailure()
       return false
     }
-    currentPosition = idx
+    currentPosition = next
     return true
   }
 
@@ -278,17 +285,25 @@ extension Processor {
     return true
   }
 
+  func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? {
+    if let cur = load(), bitset.matches(char: cur) {
+      return input.index(after: currentPosition)
+    } else {
+      return nil
+    }
+  }
+
   // If we have a bitset we know that the CharacterClass only matches against
   // ascii characters, so check if the current input element is ascii then
   // check if it is set in the bitset
   mutating func matchBitset(
     _ bitset: DSLTree.CustomCharacterClass.AsciiBitset
   ) -> Bool {
-    guard let cur = load(), bitset.matches(char: cur) else {
+    guard let next = _doMatchBitset(bitset) else {
       signalFailure()
       return false
     }
-    _uncheckedForcedConsumeOne()
+    currentPosition = next
     return true
   }
 
@@ -297,7 +312,7 @@ extension Processor {
     _ bitset: DSLTree.CustomCharacterClass.AsciiBitset
   ) -> Bool {
     guard let curScalar = loadScalar(),
-      bitset.matches(scalar: curScalar),
+          bitset.matches(scalar: curScalar),
           let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else {
       signalFailure()
       return false
@@ -307,12 +322,31 @@ extension Processor {
   }
 
   mutating func signalFailure() {
-    guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) =
-            savePoints.popLast()?.destructure
-    else {
+    guard !savePoints.isEmpty else {
       state = .fail
       return
     }
+    let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters): (
+      pc: InstructionAddress,
+      pos: Position?,
+      stackEnd: CallStackAddress,
+      captureEnds: [_StoredCapture],
+      intRegisters: [Int],
+      PositionRegister: [Input.Index]
+    )
+
+    let idx = savePoints.index(before: savePoints.endIndex)
+    // If we have a quantifier save point, move the next range position into pos
+    if !savePoints[idx].rangeIsEmpty {
+      savePoints[idx].takePositionFromRange(input)
+    }
+    // If we have a normal save point or an empty quantifier save point, remove it
+    if savePoints[idx].rangeIsEmpty {
+      (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.removeLast().destructure
+    } else {
+      (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints[idx].destructure
+    }
+
     assert(stackEnd.rawValue <= callStack.count)
     assert(capEnds.count == storedCaptures.count)
 
@@ -366,7 +400,6 @@ extension Processor {
       _checkInvariants()
     }
     let (opcode, payload) = fetch().destructure
-
     switch opcode {
     case .invalid:
       fatalError("Invalid program")
@@ -487,6 +520,25 @@ extension Processor {
       ) {
         controller.step()
       }
+    case .quantify:
+      let quantPayload = payload.quantify
+      let matched: Bool
+      switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.extraTrips) {
+      case (.reluctant, _, _):
+        // Trap in every build mode: returning here would neither step nor signal failure
+        fatalError(".reluctant is not supported by .quantify")
+      case (.eager, 0, nil):
+        matched = runEagerZeroOrMoreQuantify(quantPayload)
+      case (.eager, 1, nil):
+        matched = runEagerOneOrMoreQuantify(quantPayload)
+      case (_, 0, 1):
+        matched = runZeroOrOneQuantify(quantPayload)
+      default:
+        matched = runQuantify(quantPayload)
+      }
+      if matched {
+        controller.step()
+      }
 
     case .consumeBy:
       let reg = payload.consumer
@@ -590,5 +642,3 @@ extension Processor {
     }
   }
 }
-
-
diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift
index 525beec63..cbb065fc1 100644
--- a/Sources/_StringProcessing/Engine/Tracing.swift
+++ b/Sources/_StringProcessing/Engine/Tracing.swift
@@ -59,7 +59,11 @@ extension Processor.SavePoint {
     if let p = self.pos {
       posStr = "\(input.distance(from: input.startIndex, to: p))"
     } else {
-      posStr = ""
+      if rangeIsEmpty {
+        posStr = ""
+      } else {
+        posStr = "\(rangeStart!...rangeEnd!)"
+      }
     }
     return """
       pc: \(self.pc), pos: \(posStr), stackEnd: \(stackEnd)
diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift
index 3be26f27f..c5f1f8ecd 100644
--- a/Sources/_StringProcessing/_CharacterClassModel.swift
+++ b/Sources/_StringProcessing/_CharacterClassModel.swift
@@ -156,6 +156,15 @@ struct _CharacterClassModel: Hashable {
   }
 }
 
+extension _CharacterClassModel {
+  var consumesSingleGrapheme: Bool {
+    switch self.cc {
+    case .anyScalar: return false
+    default: return true
+    }
+  }
+}
+
 extension _CharacterClassModel.Representation {
   /// Returns true if this CharacterClass should be matched by strict ascii under the given options
   func isStrictAscii(options: MatchingOptions) -> Bool {
diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift
index e0702f87f..54fc3b561 100644
--- a/Tests/RegexTests/CompileTests.swift
+++ b/Tests/RegexTests/CompileTests.swift
@@ -47,6 +47,7 @@ enum DecodedInstr {
   case endCapture
   case transformCapture
   case captureValue
+  case quantify
 }
 
 extension DecodedInstr {
@@ -120,6 +121,8 @@ extension DecodedInstr {
       return .assertBy
     case .matchBy:
       return .matchBy
+    case .quantify:
+      return .quantify
     case .backreference:
       return .backreference
     case .beginCapture:
@@ -304,7 +307,7 @@ extension RegexTests {
       matchingOptions(adding: [.caseInsensitive]))
   }
 
-  private func expectProgram(
+  func expectProgram(
     for regex: String,
     syntax: SyntaxOptions = .traditional,
     semanticLevel: RegexSemanticLevel? = nil,
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index ff51088ff..794e57b16 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -2522,4 +2522,35 @@ extension RegexTests {
     expectCompletion(regex: #"(a{,4})*"#, in: "aa")
     expectCompletion(regex: #"((|)+)*"#, in: "aa")
   }
+
+  func testQuantifyOptimization() throws {
+    // test that the maximum values for minTrips and extraTrips are handled correctly
+    let maxStorable = Int(QuantifyPayload.maxStorableTrips)
+    let maxExtraTrips = "a{,\(maxStorable)}"
+    expectProgram(for: maxExtraTrips, contains: [.quantify])
+    firstMatchTest(maxExtraTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable))
+    firstMatchTest(maxExtraTrips, input: String(repeating: "a", count: maxStorable + 1), match: String(repeating: "a", count: maxStorable))
+    XCTAssertNil(try Regex(maxExtraTrips).wholeMatch(in: String(repeating: "a", count: maxStorable + 1)))
+
+    let maxMinTrips = "a{\(maxStorable),}"
+    expectProgram(for: maxMinTrips, contains: [.quantify])
+    firstMatchTest(maxMinTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable))
+    firstMatchTest(maxMinTrips, input: String(repeating: "a", count: maxStorable - 1), match: nil)
+
+    let maxBothTrips = "a{\(maxStorable),\(maxStorable*2)}"
+    expectProgram(for: maxBothTrips, contains: [.quantify])
+    XCTAssertNil(try Regex(maxBothTrips).wholeMatch(in: String(repeating: "a", count: maxStorable*2 + 1)))
+    firstMatchTest(maxBothTrips, input: String(repeating: "a", count: maxStorable*2), match: String(repeating: "a", count: maxStorable*2))
+    firstMatchTest(maxBothTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable))
+    firstMatchTest(maxBothTrips, input: String(repeating: "a", count: maxStorable - 1), match: nil)
+
+    // a quantifier that stops at its upper bound must still be able to give back its last trip
+    expectProgram(for: "a{,2}a", contains: [.quantify])
+    firstMatchTest("a{,2}a", input: "aa", match: "aa")
+
+    expectProgram(for: "a{,\(maxStorable+1)}", doesNotContain: [.quantify])
+    expectProgram(for: "a{\(maxStorable+1),}", doesNotContain: [.quantify])
+    expectProgram(for: "a{\(maxStorable-1),\(maxStorable*2)}", doesNotContain: [.quantify])
+    expectProgram(for: "a{\(maxStorable),\(maxStorable*2+1)}", doesNotContain: [.quantify])
+  }
 }