From 3b6b676688f3bd63b7323325e4af16525615edf6 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 14:20:36 -0700 Subject: [PATCH 01/35] Copy over new ascii bitset --- .../_StringProcessing/ConsumerInterface.swift | 7 ++ Sources/_StringProcessing/Regex/DSLTree.swift | 80 --------------- .../Utility/AsciiBitset.swift | 99 +++++++++++++++++++ 3 files changed, 106 insertions(+), 80 deletions(-) create mode 100644 Sources/_StringProcessing/Utility/AsciiBitset.swift diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index af46b5381..e4304fa6f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -11,6 +11,13 @@ @_implementationOnly import _RegexParser +extension Character { + var _singleScalarAsciiValue: UInt8? { + guard self != "\r\n" else { return nil } + return asciiValue + } +} + extension DSLTree.Node { /// Attempt to generate a consumer from this AST node /// diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 0714c5d2c..ab6c25a59 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -162,86 +162,6 @@ extension DSLTree { indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) } - - internal struct AsciiBitset { - let isInverted: Bool - var a: UInt64 = 0 - var b: UInt64 = 0 - - init(isInverted: Bool) { - self.isInverted = isInverted - } - - init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { - self.isInverted = isInverted - add(val, isCaseInsensitive) - } - - init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { - self.isInverted = isInverted - for val in low...high { - add(val, isCaseInsensitive) - } - } - - internal init( - a: UInt64, - b: UInt64, - isInverted: Bool - ) { - 
self.isInverted = isInverted - self.a = a - self.b = b - } - - internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { - setBit(val) - if isCaseInsensitive { - switch val { - case 64...90: setBit(val + 32) - case 97...122: setBit(val - 32) - default: break - } - } - } - - internal mutating func setBit(_ val: UInt8) { - if val < 64 { - a = a | 1 << val - } else { - b = b | 1 << (val - 64) - } - } - - internal func matches(char: Character) -> Bool { - let ret: Bool - if let val = char.asciiValue { - if val < 64 { - ret = (a >> val) & 1 == 1 - } else { - ret = (b >> (val - 64)) & 1 == 1 - } - } else { - ret = false - } - - if isInverted { - return !ret - } - - return ret - } - - /// Joins another bitset from a Member of the same CustomCharacterClass - internal func union(_ other: AsciiBitset) -> AsciiBitset { - precondition(self.isInverted == other.isInverted) - return AsciiBitset( - a: self.a | other.a, - b: self.b | other.b, - isInverted: self.isInverted - ) - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift new file mode 100644 index 000000000..ad3159820 --- /dev/null +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -0,0 +1,99 @@ +extension DSLTree.CustomCharacterClass { + internal struct AsciiBitset { + let isInverted: Bool + var a: UInt64 = 0 + var b: UInt64 = 0 + + init(isInverted: Bool) { + self.isInverted = isInverted + } + + init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { + self.isInverted = isInverted + add(val, isCaseInsensitive) + } + + init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { + self.isInverted = isInverted + for val in low...high { + add(val, isCaseInsensitive) + } + } + + internal init( + a: UInt64, + b: UInt64, + isInverted: Bool + ) { + self.isInverted = isInverted + self.a = a + self.b = b + } + + internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { + 
setBit(val) + if isCaseInsensitive { + switch val { + case 64...90: setBit(val + 32) + case 97...122: setBit(val - 32) + default: break + } + } + } + + internal mutating func setBit(_ val: UInt8) { + if val < 64 { + a = a | 1 << val + } else { + b = b | 1 << (val - 64) + } + } + + private func matches(_ val: UInt8) -> Bool { + if val < 64 { + return (a >> val) & 1 == 1 + } else { + return (b >> (val - 64)) & 1 == 1 + } + } + + internal func matches(char: Character) -> Bool { + let matched: Bool + if let val = char._singleScalarAsciiValue { + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + internal func matches(scalar: Unicode.Scalar) -> Bool { + let matched: Bool + if scalar.isASCII { + let val = UInt8(ascii: scalar) + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + /// Joins another bitset from a Member of the same CustomCharacterClass + internal func union(_ other: AsciiBitset) -> AsciiBitset { + precondition(self.isInverted == other.isInverted) + return AsciiBitset( + a: self.a | other.a, + b: self.b | other.b, + isInverted: self.isInverted + ) + } + } +} From 33caa79f06fe6a8b4690c351d913b55359166825 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 14:21:39 -0700 Subject: [PATCH 02/35] Add matchBuiltin --- Sources/_StringProcessing/ByteCodeGen.swift | 15 ++ Sources/_StringProcessing/Compiler.swift | 3 +- .../Engine/InstPayload.swift | 16 ++ .../Engine/Instruction.swift | 2 +- .../_StringProcessing/Engine/MEBuilder.swift | 10 ++ .../_StringProcessing/Engine/MEBuiltins.swift | 4 - .../_StringProcessing/Engine/Processor.swift | 145 ++++++++++++++++-- .../_CharacterClassModel.swift | 92 +++++++++++ 8 files changed, 270 insertions(+), 17 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 820a4c721..27607706b 100644 --- 
a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -66,6 +66,11 @@ fileprivate extension Compiler.ByteCodeGen { options.apply(optionSequence.ast) case let .unconverted(astAtom): + if optimizationsEnabled, + let cc = astAtom.ast.characterClass?.builtinCC { + emitBuiltinCharacterClass(cc) + return + } if let consumer = try astAtom.ast.generateConsumer(options) { builder.buildConsume(by: consumer) } else { @@ -95,6 +100,16 @@ fileprivate extension Compiler.ByteCodeGen { throw Unsupported("Backreference kind: \(ref)") } } + + mutating func emitBuiltinCharacterClass( + _ cc: BuiltinCC + ) { + builder.buildMatchBuiltin( + cc, + cc.isStrict(options: options), + cc.asciiBitset, + isScalar: options.semanticLevel == .unicodeScalar) + } mutating func emitAssertion( _ kind: AST.Atom.AssertionKind diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index f47898e4e..88e2ce8e3 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -64,7 +64,7 @@ func _compileRegex( ) throws -> Executor { let ast = try parse(regex, syntax) let dsl: DSLTree - + print(ast) switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) @@ -75,6 +75,7 @@ func _compileRegex( case .none: dsl = ast.dslTree } + print(dsl) let program = try Compiler(tree: dsl).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c614e10fd..deabf7231 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -203,6 +203,22 @@ extension Instruction.Payload { var bitset: AsciiBitsetRegister { interpret() } + + init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool, bitset: AsciiBitsetRegister) { + let strictBit = 
isStrict ? 1 << 15 : 0 + let scalarBit = isScalar ? 1 << 14 : 0 + // val must be 16 bits, reserve the top 2 bits for if it is strict ascii or scalar + assert(cc.rawValue <= 0x3F_FF) + let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit) + self.init(val, bitset) + } + var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool, bitset: AsciiBitsetRegister) { + let (val, bitset): (UInt64, AsciiBitsetRegister) = self.interpretPair() + let cc = BuiltinCC(rawValue: val & 0x3F_FF)! + let isStrict = (val >> 15) & 1 == 1 + let isScalar = (val >> 14) & 1 == 1 + return (cc, isStrict, isScalar, bitset) + } init(consumer: ConsumeFunctionRegister) { self.init(consumer) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4e715ad9d..5e5d51acf 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -92,7 +92,7 @@ extension Instruction { case builtinAssertion /// TODO: builtin character classes - case builtinCharacterClass + case matchBuiltin // MARK: Extension points diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 676b21473..e1326fbd9 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -154,6 +154,16 @@ extension MEProgram.Builder { instructions.append(.init( .matchBitset, .init(bitset: makeAsciiBitset(b)))) } + + mutating func buildMatchBuiltin( + _ cc: BuiltinCC, + _ isStrict: Bool, + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalar: Bool + ) { + instructions.append(.init( + .matchBuiltin, .init(cc, isStrict, isScalar, bitset: makeAsciiBitset(bitset)))) + } mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index f791da37e..720a41618 100644 
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -6,8 +6,4 @@ extension Processor { mutating func builtinAssertion() { fatalError("TODO: assertions and anchors") } - - mutating func builtinCharacterClass() { - fatalError("TODO: character classes") - } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a5368138c..05f91ae35 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -226,7 +226,11 @@ extension Processor { } return true } - + + func loadScalar() -> Unicode.Scalar? { + currentPosition < end ? input.unicodeScalars[currentPosition] : nil + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -240,6 +244,117 @@ extension Processor { _uncheckedForcedConsumeOne() return true } + + mutating func matchBuiltin( + _ cc: BuiltinCC, + _ isStrictAscii: Bool, + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let c = load() else { + signalFailure() + return false + } + + // Fast path: See if c is a single scalar ascii character + // If so, and it matches, consume a character + // Note: CR-LF will fall through because it is not a single scalar + if bitset.matches(char: c) && cc != .anyScalar { + _uncheckedForcedConsumeOne() + return true + } + + // Slow path: Do full match + var matched: Bool + var next = input.index(after: currentPosition) + switch cc { + // lily note: when do these `any` cases appear? can they be compiled + // into consume instructions at compile time? 
+ case .any, .anyGrapheme: matched = true + case .anyScalar: + matched = true + next = input.unicodeScalars.index(after: currentPosition) + case .digit: + matched = c.isNumber && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = c.isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !isStrictAscii) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !isStrictAscii) + case .whitespace: + matched = c.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) + } + + if matched { + currentPosition = next + return true + } else { + signalFailure() + return false + } + } + + mutating func matchBuiltinScalar( + _ cc: BuiltinCC, + _ isStrictAscii: Bool, + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let c = loadScalar() else { + signalFailure() + return false + } + + // Fast path: See if c is a single scalar ascii character + // If so, and it matches, consume a character + // Note: CR-LF must be matched fully if we are matching a .newlineSequence + // so exclude "\r" from the fast path + if bitset.matches(scalar: c) && cc != .anyGrapheme && c != "\r" { + input.unicodeScalars.formIndex(after: ¤tPosition) + return true + } + + // Slow path: Do full match + var matched: Bool + var next = input.unicodeScalars.index(after: currentPosition) + switch cc { + case .any: matched = true + case .anyScalar: matched = true + case .anyGrapheme: + matched = true + next = input.index(after: currentPosition) + case .digit: + matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) + case .verticalWhitespace: + matched 
= c.isNewline && (c.isASCII || !isStrictAscii) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !isStrictAscii) + // lily note: what exactly is this doing? matching a full cr-lf character + // even though its in scalar mode? why? + if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { + input.unicodeScalars.formIndex(after: &next) + } + case .whitespace: + matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) + } + + if matched { + currentPosition = next + return true + } else { + signalFailure() + return false + } + } mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters) = @@ -385,6 +500,19 @@ extension Processor { controller.step() } + case .matchBuiltin: + let (cc, isStrict, isScalar, reg) = payload.builtinCCPayload + let bitset = registers[reg] + if isScalar { + if matchBuiltinScalar(cc, isStrict, bitset) { + controller.step() + } + } else { + if matchBuiltin(cc, isStrict, bitset) { + controller.step() + } + } + case .consumeBy: let reg = payload.consumer guard currentPosition < searchBounds.upperBound, @@ -450,16 +578,14 @@ extension Processor { case .beginCapture: let capNum = Int( asserting: payload.capture.rawValue) + storedCaptures[capNum].startCapture(currentPosition) + controller.step() - storedCaptures[capNum].startCapture(currentPosition) - controller.step() - - case .endCapture: + case .endCapture: let capNum = Int( asserting: payload.capture.rawValue) - - storedCaptures[capNum].endCapture(currentPosition) - controller.step() + storedCaptures[capNum].endCapture(currentPosition) + controller.step() case .transformCapture: let (cap, trans) = payload.pairedCaptureTransform @@ -490,9 +616,6 @@ extension Processor { case .builtinAssertion: builtinAssertion() - - case .builtinCharacterClass: - builtinCharacterClass() } } } diff --git 
a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..87f1c708e 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -592,3 +592,95 @@ extension _CharacterClassModel { } } + +internal enum BuiltinCC: UInt64 { + case any = 1 + case anyGrapheme + case anyScalar + case digit + case hexDigit + case horizontalWhitespace + case newlineSequence + case verticalWhitespace + case whitespace + case word +} + +extension BuiltinCC { + func isStrict(options: MatchingOptions) -> Bool { + switch self { + case .digit: return options.usesASCIIDigits + case .hexDigit: return options.usesASCIIDigits + case .horizontalWhitespace: return options.usesASCIISpaces + case .newlineSequence: return options.usesASCIISpaces + case .whitespace: return options.usesASCIISpaces + case .word: return options.usesASCIIWord + default: return false + } + } + + /// A bitset representing the ascii values that this character class can match + var asciiBitset: DSLTree.CustomCharacterClass.AsciiBitset { + let allAscii = Array(0...127).map { Character(Unicode.Scalar($0)) } + let filtered: [Character] + switch self { + case .any: + filtered = allAscii + case .anyGrapheme: + filtered = allAscii + case .anyScalar: + filtered = allAscii + case .digit: + filtered = allAscii.filter { $0.isNumber } + case .hexDigit: + filtered = allAscii.filter { $0.isHexDigit } + case .horizontalWhitespace: + filtered = allAscii.filter { $0.unicodeScalars.first?.isHorizontalWhitespace == true } + case .newlineSequence: + filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } + case .verticalWhitespace: + filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } + case .whitespace: + filtered = allAscii.filter { $0.isWhitespace == true } + case .word: + filtered = allAscii.filter { $0.isWordCharacter } + } + var bitset = 
DSLTree.CustomCharacterClass.AsciiBitset(isInverted: false) + for c in filtered { bitset.add(c.asciiValue!, false) } + return bitset + } +} + +extension _CharacterClassModel { + internal var builtinCC: BuiltinCC? { + if isInverted { return nil } // lily todo: add another flag to the payload? when is this set? why are there so many weird edge cases in ccm? it feels like it's trying to model both builtins and custom models + + // in that case, should we just convert a ccm to a ccc + // if it has these weird flags set? + // completely remove ccm from compilation and just emit either a builtincc or a ccc or an advance + switch self.cc { + case .any: + return .any + case .anyGrapheme: + return .anyGrapheme + case .anyScalar: + return .anyScalar + case .digit: + return .digit + case .hexDigit: + return .hexDigit + case .horizontalWhitespace: + return .horizontalWhitespace + case .newlineSequence: + return .newlineSequence + case .verticalWhitespace: + return .verticalWhitespace + case .whitespace: + return .whitespace + case .word: + return .word + case .custom(_): + return nil + } + } +} From 139daa56a1c2a5a65536edceeed4ff736bb640c4 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 14:37:10 -0700 Subject: [PATCH 03/35] Remove debug prints --- Sources/_StringProcessing/Compiler.swift | 2 -- 1 file changed, 2 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 88e2ce8e3..bc6a4ec99 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -64,7 +64,6 @@ func _compileRegex( ) throws -> Executor { let ast = try parse(regex, syntax) let dsl: DSLTree - print(ast) switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) @@ -75,7 +74,6 @@ func _compileRegex( case .none: dsl = ast.dslTree } - print(dsl) let program = try Compiler(tree: dsl).emit() return 
Executor(program: program) } From 9abf4afd6309d3c57aca02ee8404d77d639c4c22 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 16:22:07 -0700 Subject: [PATCH 04/35] Remove bitset fast path --- .../Engine/InstPayload.swift | 10 +++--- .../_StringProcessing/Engine/MEBuilder.swift | 2 +- .../_StringProcessing/Engine/Processor.swift | 31 +++---------------- 3 files changed, 11 insertions(+), 32 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index deabf7231..0c10d6373 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -204,20 +204,20 @@ extension Instruction.Payload { interpret() } - init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool, bitset: AsciiBitsetRegister) { + init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool) { let strictBit = isStrict ? 1 << 15 : 0 let scalarBit = isScalar ? 1 << 14 : 0 // val must be 16 bits, reserve the top 2 bits for if it is strict ascii or scalar assert(cc.rawValue <= 0x3F_FF) let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit) - self.init(val, bitset) + self.init(val) } - var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool, bitset: AsciiBitsetRegister) { - let (val, bitset): (UInt64, AsciiBitsetRegister) = self.interpretPair() + var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool) { + let val = self.rawValue let cc = BuiltinCC(rawValue: val & 0x3F_FF)! 
let isStrict = (val >> 15) & 1 == 1 let isScalar = (val >> 14) & 1 == 1 - return (cc, isStrict, isScalar, bitset) + return (cc, isStrict, isScalar) } init(consumer: ConsumeFunctionRegister) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index e1326fbd9..2ed6f599b 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -162,7 +162,7 @@ extension MEProgram.Builder { isScalar: Bool ) { instructions.append(.init( - .matchBuiltin, .init(cc, isStrict, isScalar, bitset: makeAsciiBitset(bitset)))) + .matchBuiltin, .init(cc, isStrict, isScalar))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 05f91ae35..81e304b0f 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -247,22 +247,13 @@ extension Processor { mutating func matchBuiltin( _ cc: BuiltinCC, - _ isStrictAscii: Bool, - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ isStrictAscii: Bool ) -> Bool { guard let c = load() else { signalFailure() return false } - // Fast path: See if c is a single scalar ascii character - // If so, and it matches, consume a character - // Note: CR-LF will fall through because it is not a single scalar - if bitset.matches(char: c) && cc != .anyScalar { - _uncheckedForcedConsumeOne() - return true - } - // Slow path: Do full match var matched: Bool var next = input.index(after: currentPosition) @@ -300,23 +291,12 @@ extension Processor { mutating func matchBuiltinScalar( _ cc: BuiltinCC, - _ isStrictAscii: Bool, - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ isStrictAscii: Bool ) -> Bool { guard let c = loadScalar() else { signalFailure() return false } - - // Fast path: See if c is a single scalar ascii character - // If so, and it matches, consume a character - // Note: CR-LF 
must be matched fully if we are matching a .newlineSequence - // so exclude "\r" from the fast path - if bitset.matches(scalar: c) && cc != .anyGrapheme && c != "\r" { - input.unicodeScalars.formIndex(after: ¤tPosition) - return true - } - // Slow path: Do full match var matched: Bool var next = input.unicodeScalars.index(after: currentPosition) @@ -501,14 +481,13 @@ extension Processor { } case .matchBuiltin: - let (cc, isStrict, isScalar, reg) = payload.builtinCCPayload - let bitset = registers[reg] + let (cc, isStrict, isScalar) = payload.builtinCCPayload if isScalar { - if matchBuiltinScalar(cc, isStrict, bitset) { + if matchBuiltinScalar(cc, isStrict) { controller.step() } } else { - if matchBuiltin(cc, isStrict, bitset) { + if matchBuiltin(cc, isStrict) { controller.step() } } From 286f5d8b1ec3f0e7d2cb06ac672de86439612b3a Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 17:15:53 -0700 Subject: [PATCH 05/35] Fully remove remnants of the bitset fast path --- Sources/_StringProcessing/ByteCodeGen.swift | 1 - .../_StringProcessing/Engine/MEBuilder.swift | 1 - .../_CharacterClassModel.swift | 31 ------------------- 3 files changed, 33 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 27607706b..1a38dec29 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -107,7 +107,6 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildMatchBuiltin( cc, cc.isStrict(options: options), - cc.asciiBitset, isScalar: options.semanticLevel == .unicodeScalar) } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 2ed6f599b..3a7784f57 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -158,7 +158,6 @@ extension MEProgram.Builder { mutating func buildMatchBuiltin( _ cc: BuiltinCC, _ isStrict: Bool, - _ 
bitset: DSLTree.CustomCharacterClass.AsciiBitset, isScalar: Bool ) { instructions.append(.init( diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 87f1c708e..3d23e5399 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -618,37 +618,6 @@ extension BuiltinCC { default: return false } } - - /// A bitset representing the ascii values that this character class can match - var asciiBitset: DSLTree.CustomCharacterClass.AsciiBitset { - let allAscii = Array(0...127).map { Character(Unicode.Scalar($0)) } - let filtered: [Character] - switch self { - case .any: - filtered = allAscii - case .anyGrapheme: - filtered = allAscii - case .anyScalar: - filtered = allAscii - case .digit: - filtered = allAscii.filter { $0.isNumber } - case .hexDigit: - filtered = allAscii.filter { $0.isHexDigit } - case .horizontalWhitespace: - filtered = allAscii.filter { $0.unicodeScalars.first?.isHorizontalWhitespace == true } - case .newlineSequence: - filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } - case .verticalWhitespace: - filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } - case .whitespace: - filtered = allAscii.filter { $0.isWhitespace == true } - case .word: - filtered = allAscii.filter { $0.isWordCharacter } - } - var bitset = DSLTree.CustomCharacterClass.AsciiBitset(isInverted: false) - for c in filtered { bitset.add(c.asciiValue!, false) } - return bitset - } } extension _CharacterClassModel { From e593ddb733dbd3a2217864f8963384f12651c222 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 11 Jul 2022 15:34:50 -0700 Subject: [PATCH 06/35] Completely replace AssertionFunction with regexAssert(by:) --- Sources/_StringProcessing/ByteCodeGen.swift | 136 +----------------- .../Engine/InstPayload.swift | 66 ++++++++- .../_StringProcessing/Engine/MEBuilder.swift | 29 ++-- 
.../_StringProcessing/Engine/MEProgram.swift | 9 -- .../_StringProcessing/Engine/Processor.swift | 125 ++++++++++++++-- .../_StringProcessing/Engine/Registers.swift | 9 -- .../Unicode/WordBreaking.swift | 33 +++++ .../_StringProcessing/Utility/TypedInt.swift | 4 - 8 files changed, 231 insertions(+), 180 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2368791ee..d0f8c8266 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -127,136 +127,12 @@ fileprivate extension Compiler.ByteCodeGen { mutating func emitAssertion( _ kind: AST.Atom.AssertionKind ) throws { - // FIXME: Depends on API model we have... We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. 
- switch kind { - case .startOfSubject: - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.lowerBound - } - - case .endOfSubjectBeforeNewline: - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input.index(after: pos) == subjectBounds.upperBound - && input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound - && input.unicodeScalars[pos].isNewline - } - } - - case .endOfSubject: - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.upperBound - } - - case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out - throw Unsupported(#"\K (reset/keep assertion)"#) - - case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - builder.buildAssert { (_, _, input, pos, subjectBounds) in false } - - case .textSegment: - builder.buildAssert { (_, _, input, pos, _) in - // FIXME: Grapheme or word based on options - input.isOnGraphemeClusterBoundary(pos) - } - - case .notTextSegment: - builder.buildAssert { (_, _, input, pos, _) in - // FIXME: Grapheme or word based on options - !input.isOnGraphemeClusterBoundary(pos) - } - - case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
- if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } - } else { - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.lowerBound - } - } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. - if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } - } else { - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.upperBound - } - } - - case .wordBoundary: - builder.buildAssert { [options] - (cache, maxIndex, input, pos, subjectBounds) in - if options.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return _CharacterClassModel.word.isBoundary( - input, - at: pos, - bounds: subjectBounds, - with: options - ) - } else { - return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) - } - } - - case .notWordBoundary: - builder.buildAssert { [options] - (cache, maxIndex, input, pos, subjectBounds) in - if options.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? 
- return !_CharacterClassModel.word.isBoundary( - input, - at: pos, - bounds: subjectBounds, - with: options - ) - } else { - return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) - } - } - } + builder.buildAssert( + by: kind, + options.anchorsMatchNewlines, + options.usesSimpleUnicodeBoundaries, + options.usesASCIIWord, + options.semanticLevel) } mutating func emitScalar(_ s: UnicodeScalar) throws { diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 0c10d6373..137f5147a 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -8,7 +8,7 @@ // See https://swift.org/LICENSE.txt for license information // //===----------------------------------------------------------------------===// - +@_implementationOnly import _RegexParser // For AssertionKind extension Instruction { /// An instruction's payload packs operands and destination @@ -51,7 +51,6 @@ extension Instruction.Payload { case element(ElementRegister) case consumer(ConsumeFunctionRegister) case bitset(AsciiBitsetRegister) - case assertion(AssertionFunctionRegister) case addr(InstructionAddress) case capture(CaptureRegister) @@ -227,11 +226,64 @@ extension Instruction.Payload { interpret() } - init(assertion: AssertionFunctionRegister) { - self.init(assertion) - } - var assertion: AssertionFunctionRegister { - interpret() + var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } + init(assertion: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) { + // 4 bits of options + let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 + let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 + let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? 
(1 << 52) : 0 + let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit + + // 4 bits for the assertion kind + // Future work: Optimize this layout + let kind: UInt64 + switch assertion { + case .endOfLine: kind = 0 + case .endOfSubject: kind = 1 + case .endOfSubjectBeforeNewline: kind = 2 + case .firstMatchingPositionInSubject: kind = 3 + case .notTextSegment: kind = 4 + case .notWordBoundary: kind = 5 + case .resetStartOfMatch: kind = 6 + case .startOfLine: kind = 7 + case .startOfSubject: kind = 8 + case .textSegment: kind = 9 + case .wordBoundary: kind = 10 + } + self.init(rawValue: kind + optionsBits) + } + var assertion: (AST.Atom.AssertionKind, Bool, Bool, Bool, MatchingOptions.SemanticLevel) { + let anchorsMatchNewlines = (self.rawValue >> 55) & 1 == 1 + let usesSimpleUnicodeBoundaries = (self.rawValue >> 54) & 1 == 1 + let usesASCIIWord = (self.rawValue >> 53) & 1 == 1 + let semanticLevel: MatchingOptions.SemanticLevel + if (self.rawValue >> 52) & 1 == 1 { + semanticLevel = .unicodeScalar + } else { + semanticLevel = .graphemeCluster + } + let kind: AST.Atom.AssertionKind + switch self.rawValue & _assertionKindMask { + case 0: kind = .endOfLine + case 1: kind = .endOfSubject + case 2: kind = .endOfSubjectBeforeNewline + case 3: kind = .firstMatchingPositionInSubject + case 4: kind = .notTextSegment + case 5: kind = .notWordBoundary + case 6: kind = .resetStartOfMatch + case 7: kind = .startOfLine + case 8: kind = .startOfSubject + case 9: kind = .textSegment + case 10: kind = .wordBoundary + default: fatalError("Unreachable") + } + return (kind, anchorsMatchNewlines, usesSimpleUnicodeBoundaries, usesASCIIWord, semanticLevel) } init(addr: InstructionAddress) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 3a7784f57..a4e02bb9f 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -20,7 +20,6 @@ 
extension MEProgram { var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = [] var consumeFunctions: [ConsumeFunction] = [] - var assertionFunctions: [AssertionFunction] = [] var transformFunctions: [TransformFunction] = [] var matcherFunctions: [MatcherFunction] = [] @@ -171,11 +170,27 @@ extension MEProgram.Builder { .consumeBy, .init(consumer: makeConsumeFunction(p)))) } +// mutating func buildAssert( +// by p: @escaping MEProgram.AssertionFunction +// ) { +// instructions.append(.init( +// .assertBy, .init(assertion: makeAssertionFunction(p)))) +// } mutating func buildAssert( - by p: @escaping MEProgram.AssertionFunction + by kind: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel ) { instructions.append(.init( - .assertBy, .init(assertion: makeAssertionFunction(p)))) + .assertBy, + .init( + assertion: kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel))) } mutating func buildAccept() { @@ -292,7 +307,6 @@ extension MEProgram.Builder { regInfo.values = nextValueRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count - regInfo.assertionFunctions = assertionFunctions.count regInfo.transformFunctions = transformFunctions.count regInfo.matcherFunctions = matcherFunctions.count regInfo.captures = nextCaptureRegister.rawValue @@ -303,7 +317,6 @@ extension MEProgram.Builder { staticSequences: sequences.stored, staticBitsets: asciiBitsets, staticConsumeFunctions: consumeFunctions, - staticAssertionFunctions: assertionFunctions, staticTransformFunctions: transformFunctions, staticMatcherFunctions: matcherFunctions, registerInfo: regInfo, @@ -446,12 +459,6 @@ extension MEProgram.Builder { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - mutating func makeAssertionFunction( - _ f: @escaping 
MEProgram.AssertionFunction - ) -> AssertionFunctionRegister { - defer { assertionFunctions.append(f) } - return AssertionFunctionRegister(assertionFunctions.count) - } mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d311b4465..bacefb209 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -15,14 +15,6 @@ struct MEProgram { typealias Input = String typealias ConsumeFunction = (Input, Range) -> Input.Index? - typealias AssertionFunction = - ( - inout Set?, - inout String.Index?, - Input, - Input.Index, - Range - ) throws -> Bool typealias TransformFunction = (Input, Processor._StoredCapture) throws -> Any? typealias MatcherFunction = @@ -34,7 +26,6 @@ struct MEProgram { var staticSequences: [[Input.Element]] var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] var staticConsumeFunctions: [ConsumeFunction] - var staticAssertionFunctions: [AssertionFunction] var staticTransformFunctions: [TransformFunction] var staticMatcherFunctions: [MatcherFunction] diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 63b1a37f1..5e9d814d5 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,6 +9,8 @@ // //===----------------------------------------------------------------------===// +@_implementationOnly import _RegexParser // For AssertionKind + enum MatchMode { case wholeString case partialFromFront @@ -257,7 +259,6 @@ extension Processor { return false } - // Slow path: Do full match var matched: Bool var next = input.index(after: currentPosition) switch cc { @@ -300,7 +301,7 @@ extension Processor { signalFailure() return false } - // Slow path: Do full match + var matched: Bool var next = 
input.unicodeScalars.index(after: currentPosition) switch cc { @@ -339,6 +340,107 @@ extension Processor { } } + mutating func regexAssert( + by kind: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) throws -> Bool { + // Future work: Optimize layout and dispatch + + // FIXME: Depends on API model we have... We may want to + // think through some of these with API interactions in mind + // + // This might break how we use `bounds` for both slicing + // and things like `firstIndex`, that is `firstIndex` may + // need to supply both a slice bounds and a per-search bounds. + switch kind { + case .startOfSubject: return currentPosition == subjectBounds.lowerBound + + case .endOfSubjectBeforeNewline: + if currentPosition == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input.index(after: currentPosition) == subjectBounds.upperBound + && input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound + && input.unicodeScalars[currentPosition].isNewline + } + + case .endOfSubject: return currentPosition == subjectBounds.upperBound + + case .resetStartOfMatch: + // FIXME: Figure out how to communicate this out + throw Unsupported(#"\K (reset/keep assertion)"#) + + case .firstMatchingPositionInSubject: + // TODO: We can probably build a nice model with API here + + // FIXME: This needs to be based on `searchBounds`, + // not the `subjectBounds` given as an argument here + // (Note: the above fixme was in reference to the old assert function API. 
+ // Now that we're in processor, we have access to searchBounds) + return false + + case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) + + case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) + + case .startOfLine: + // FIXME: Anchor.startOfLine must always use this first branch + // The behavior of `^` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.startOfLine` anchor should always match the start + // of a line. Right now we don't distinguish between those anchors. + if anchorsMatchNewlines { + if currentPosition == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } + } else { + return currentPosition == subjectBounds.lowerBound + } + + case .endOfLine: + // FIXME: Anchor.endOfLine must always use this first branch + // The behavior of `$` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.endOfLine` anchor should always match the end + // of a line. Right now we don't distinguish between those anchors. + if anchorsMatchNewlines { + if currentPosition == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars[currentPosition].isNewline + } + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? 
+ return atSimpleBoundary(usesASCIIWord, semanticLevel) + // lily note: there appear to be no test cases that use this option, ping alex to ask what they should look like + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + + case .notWordBoundary: + if usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return !atSimpleBoundary(usesASCIIWord, semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + } + } + mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters) = savePoints.popLast()?.destructure @@ -508,15 +610,18 @@ extension Processor { controller.step() case .assertBy: - let reg = payload.assertion - let assertion = registers[reg] + let (kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel) = payload.assertion do { - guard try assertion( - &wordIndexCache, - &wordIndexMaxIndex, - input, - currentPosition, - subjectBounds + guard try regexAssert( + by: kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel ) else { signalFailure() return diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index c76413383..edc325a30 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -33,8 +33,6 @@ extension Processor { var consumeFunctions: [MEProgram.ConsumeFunction] - var assertionFunctions: [MEProgram.AssertionFunction] - // Captured-value constructors var transformFunctions: [MEProgram.TransformFunction] @@ -77,9 +75,6 @@ extension Processor.Registers { subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { consumeFunctions[i.rawValue] } - subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { - assertionFunctions[i.rawValue] - } subscript(_ i: 
TransformRegister) -> MEProgram.TransformFunction { transformFunctions[i.rawValue] } @@ -107,9 +102,6 @@ extension Processor.Registers { self.consumeFunctions = program.staticConsumeFunctions assert(consumeFunctions.count == info.consumeFunctions) - self.assertionFunctions = program.staticAssertionFunctions - assert(assertionFunctions.count == info.assertionFunctions) - self.transformFunctions = program.staticTransformFunctions assert(transformFunctions.count == info.transformFunctions) @@ -145,7 +137,6 @@ extension MEProgram { var strings = 0 var bitsets = 0 var consumeFunctions = 0 - var assertionFunctions = 0 var transformFunctions = 0 var matcherFunctions = 0 var ints = 0 diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 94c311e82..50da079f6 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,39 @@ @_spi(_Unicode) import Swift +extension Processor { + func atSimpleBoundary( + _ usesAsciiWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) -> Bool { + func matchesWord(at i: Input.Index) -> Bool { + switch semanticLevel { + case .graphemeCluster: + let c = input[i] + return c.isWordCharacter && (c.isASCII || !usesAsciiWord) + case .unicodeScalar: + let c = input.unicodeScalars[i] + return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) + } + } + + // FIXME: How should we handle bounds? 
+ // We probably need two concepts + if subjectBounds.isEmpty { return false } + if currentPosition == subjectBounds.lowerBound { + return matchesWord(at: currentPosition) + } + let priorIdx = input.index(before: currentPosition) + if currentPosition == subjectBounds.upperBound { + return matchesWord(at: priorIdx) + } + + let prior = matchesWord(at: priorIdx) + let current = matchesWord(at: currentPosition) + return prior != current + } +} + extension String { func isOnWordBoundary( at i: String.Index, diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index adc9edf78..e03f2572f 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -142,10 +142,6 @@ enum _AsciiBitsetRegister {} typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> enum _ConsumeFunctionRegister {} -/// Used for assertion functions, e.g. anchors etc -typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -enum _AssertionFunctionRegister {} - /// Used for capture transforms, etc typealias TransformRegister = TypedInt<_TransformRegister> enum _TransformRegister {} From 3e38ac6026b956396a8eac55b2dfd79892968cf7 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 11 Jul 2022 18:13:20 -0700 Subject: [PATCH 07/35] Cleanup --- Sources/_StringProcessing/ByteCodeGen.swift | 14 ++----- Sources/_StringProcessing/Compiler.swift | 1 + .../Engine/Instruction.swift | 4 -- .../_StringProcessing/Engine/MEBuilder.swift | 6 --- .../_StringProcessing/Engine/MEBuiltins.swift | 9 ----- .../_StringProcessing/Engine/Processor.swift | 8 ---- .../_CharacterClassModel.swift | 37 ++----------------- 7 files changed, 8 insertions(+), 71 deletions(-) delete mode 100644 Sources/_StringProcessing/Engine/MEBuiltins.swift diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index a4e32f729..8e7aad6cd 100644 --- 
a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -82,7 +82,10 @@ fileprivate extension Compiler.ByteCodeGen { case let .unconverted(astAtom): if optimizationsEnabled, let cc = astAtom.ast.characterClass?.builtinCC { - emitBuiltinCharacterClass(cc) + builder.buildMatchBuiltin( + cc, + cc.isStrict(options: options), + isScalar: options.semanticLevel == .unicodeScalar) return } if let consumer = try astAtom.ast.generateConsumer(options) { @@ -114,15 +117,6 @@ fileprivate extension Compiler.ByteCodeGen { throw Unsupported("Backreference kind: \(ref)") } } - - mutating func emitBuiltinCharacterClass( - _ cc: BuiltinCC - ) { - builder.buildMatchBuiltin( - cc, - cc.isStrict(options: options), - isScalar: options.semanticLevel == .unicodeScalar) - } mutating func emitAssertion( _ kind: AST.Atom.AssertionKind diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 97466dd66..530126a32 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -68,6 +68,7 @@ func _compileRegex( ) throws -> Executor { let ast = try parse(regex, syntax) let dsl: DSLTree + switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index a2ffa2d8a..6bccb294b 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -106,10 +106,6 @@ extension Instruction { /// Operand: Ascii bitset register containing the bitset case matchBitset - /// TODO: builtin assertions and anchors - case builtinAssertion - - /// TODO: builtin character classes case matchBuiltin // MARK: Extension points diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift 
b/Sources/_StringProcessing/Engine/MEBuilder.swift index 3378369c4..fd8b97beb 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -179,12 +179,6 @@ extension MEProgram.Builder { .consumeBy, .init(consumer: makeConsumeFunction(p)))) } -// mutating func buildAssert( -// by p: @escaping MEProgram.AssertionFunction -// ) { -// instructions.append(.init( -// .assertBy, .init(assertion: makeAssertionFunction(p)))) -// } mutating func buildAssert( by kind: AST.Atom.AssertionKind, _ anchorsMatchNewlines: Bool, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift deleted file mode 100644 index 720a41618..000000000 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ /dev/null @@ -1,9 +0,0 @@ - - -extension Processor { - - - mutating func builtinAssertion() { - fatalError("TODO: assertions and anchors") - } -} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index c4fc139ae..5c557dc81 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -262,8 +262,6 @@ extension Processor { var matched: Bool var next = input.index(after: currentPosition) switch cc { - // lily note: when do these `any` cases appear? can they be compiled - // into consume instructions at compile time? case .any, .anyGrapheme: matched = true case .anyScalar: matched = true @@ -320,8 +318,6 @@ extension Processor { matched = c.isNewline && (c.isASCII || !isStrictAscii) case .newlineSequence: matched = c.isNewline && (c.isASCII || !isStrictAscii) - // lily note: what exactly is this doing? matching a full cr-lf character - // even though its in scalar mode? why? 
if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { input.unicodeScalars.formIndex(after: &next) } @@ -426,7 +422,6 @@ extension Processor { if usesSimpleUnicodeBoundaries { // TODO: How should we handle bounds? return atSimpleBoundary(usesASCIIWord, semanticLevel) - // lily note: there appear to be no test cases that use this option, ping alex to ask what they should look like } else { return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) } @@ -716,9 +711,6 @@ extension Processor { storedCaptures[capNum].registerValue( value, overwriteInitial: sp) controller.step() - - case .builtinAssertion: - builtinAssertion() } } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 3d23e5399..d91dd4f65 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -564,35 +564,6 @@ extension DSLTree.CustomCharacterClass { } } -extension _CharacterClassModel { - // FIXME: Calling on inverted sets wont be the same as the - // inverse of a boundary if at the start or end of the - // string. (Think through what we want: do it ourselves or - // give the caller both options). - func isBoundary( - _ input: String, - at pos: String.Index, - bounds: Range, - with options: MatchingOptions - ) -> Bool { - // FIXME: How should we handle bounds? 
- // We probably need two concepts - if bounds.isEmpty { return false } - if pos == bounds.lowerBound { - return self.matches(in: input, at: pos, with: options) != nil - } - let priorIdx = input.index(before: pos) - if pos == bounds.upperBound { - return self.matches(in: input, at: priorIdx, with: options) != nil - } - - let prior = self.matches(in: input, at: priorIdx, with: options) != nil - let current = self.matches(in: input, at: pos, with: options) != nil - return prior != current - } - -} - internal enum BuiltinCC: UInt64 { case any = 1 case anyGrapheme @@ -622,11 +593,9 @@ extension BuiltinCC { extension _CharacterClassModel { internal var builtinCC: BuiltinCC? { - if isInverted { return nil } // lily todo: add another flag to the payload? when is this set? why are there so many weird edge cases in ccm? it feels like it's trying to model both builtins and custom models - - // in that case, should we just convert a ccm to a ccc - // if it has these weird flags set? - // completely remove ccm from compilation and just emit either a builtincc or a ccc or an advance + // Future work: Make CCM always either a BuiltinCC or convertable to a + // custom character class + if isInverted { return nil } switch self.cc { case .any: return .any From e5d8b4a05dfb5e4b2017eb8a03f34ff47ddd53c4 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 12:19:45 -0700 Subject: [PATCH 08/35] Move match builtin and assert + Add AssertionPayload --- .../Engine/InstPayload.swift | 64 +---- .../_StringProcessing/Engine/MEBuilder.swift | 13 +- .../_StringProcessing/Engine/MEBuiltins.swift | 256 ++++++++++++++++++ .../_StringProcessing/Engine/Processor.swift | 201 +------------- 4 files changed, 270 insertions(+), 264 deletions(-) create mode 100644 Sources/_StringProcessing/Engine/MEBuiltins.swift diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 5a3d0a55d..31ff6c369 100644 --- 
a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -8,7 +8,6 @@ // See https://swift.org/LICENSE.txt for license information // //===----------------------------------------------------------------------===// -@_implementationOnly import _RegexParser // For AssertionKind extension Instruction { /// An instruction's payload packs operands and destination @@ -226,64 +225,11 @@ extension Instruction.Payload { interpret() } - var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } - init(assertion: AST.Atom.AssertionKind, - _ anchorsMatchNewlines: Bool, - _ usesSimpleUnicodeBoundaries: Bool, - _ usesASCIIWord: Bool, - _ semanticLevel: MatchingOptions.SemanticLevel - ) { - // 4 bits of options - let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 - let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 - let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 - let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? 
(1 << 52) : 0 - let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit - - // 4 bits for the assertion kind - // Future work: Optimize this layout - let kind: UInt64 - switch assertion { - case .endOfLine: kind = 0 - case .endOfSubject: kind = 1 - case .endOfSubjectBeforeNewline: kind = 2 - case .firstMatchingPositionInSubject: kind = 3 - case .notTextSegment: kind = 4 - case .notWordBoundary: kind = 5 - case .resetStartOfMatch: kind = 6 - case .startOfLine: kind = 7 - case .startOfSubject: kind = 8 - case .textSegment: kind = 9 - case .wordBoundary: kind = 10 - } - self.init(rawValue: kind + optionsBits) - } - var assertion: (AST.Atom.AssertionKind, Bool, Bool, Bool, MatchingOptions.SemanticLevel) { - let anchorsMatchNewlines = (self.rawValue >> 55) & 1 == 1 - let usesSimpleUnicodeBoundaries = (self.rawValue >> 54) & 1 == 1 - let usesASCIIWord = (self.rawValue >> 53) & 1 == 1 - let semanticLevel: MatchingOptions.SemanticLevel - if (self.rawValue >> 52) & 1 == 1 { - semanticLevel = .unicodeScalar - } else { - semanticLevel = .graphemeCluster - } - let kind: AST.Atom.AssertionKind - switch self.rawValue & _assertionKindMask { - case 0: kind = .endOfLine - case 1: kind = .endOfSubject - case 2: kind = .endOfSubjectBeforeNewline - case 3: kind = .firstMatchingPositionInSubject - case 4: kind = .notTextSegment - case 5: kind = .notWordBoundary - case 6: kind = .resetStartOfMatch - case 7: kind = .startOfLine - case 8: kind = .startOfSubject - case 9: kind = .textSegment - case 10: kind = .wordBoundary - default: fatalError("Unreachable") - } - return (kind, anchorsMatchNewlines, usesSimpleUnicodeBoundaries, usesASCIIWord, semanticLevel) + init(assertion payload: AssertionPayload) { + self.init(rawValue: payload.rawValue) + } + var assertion: AssertionPayload { + AssertionPayload.init(rawValue: self.rawValue & _payloadMask) } init(addr: InstructionAddress) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift 
b/Sources/_StringProcessing/Engine/MEBuilder.swift index fd8b97beb..cc1beac60 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -186,14 +186,15 @@ extension MEProgram.Builder { _ usesASCIIWord: Bool, _ semanticLevel: MatchingOptions.SemanticLevel ) { + let payload = AssertionPayload.init( + kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel) instructions.append(.init( .assertBy, - .init( - assertion: kind, - anchorsMatchNewlines, - usesSimpleUnicodeBoundaries, - usesASCIIWord, - semanticLevel))) + .init(assertion: payload))) } mutating func buildAccept() { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift new file mode 100644 index 000000000..8d7989a50 --- /dev/null +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -0,0 +1,256 @@ +@_implementationOnly import _RegexParser // For AssertionKind + +extension Processor { + mutating func matchBuiltin( + _ cc: BuiltinCC, + _ isStrictAscii: Bool + ) -> Bool { + guard let c = load() else { + signalFailure() + return false + } + + var matched: Bool + var next = input.index(after: currentPosition) + switch cc { + case .any, .anyGrapheme: matched = true + case .anyScalar: + matched = true + next = input.unicodeScalars.index(after: currentPosition) + case .digit: + matched = c.isNumber && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = c.isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !isStrictAscii) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !isStrictAscii) + case .whitespace: + matched = c.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) + } + + if matched { + 
currentPosition = next + return true + } else { + signalFailure() + return false + } + } + + mutating func matchBuiltinScalar( + _ cc: BuiltinCC, + _ isStrictAscii: Bool + ) -> Bool { + guard let c = loadScalar() else { + signalFailure() + return false + } + + var matched: Bool + var next = input.unicodeScalars.index(after: currentPosition) + switch cc { + case .any: matched = true + case .anyScalar: matched = true + case .anyGrapheme: + matched = true + next = input.index(after: currentPosition) + case .digit: + matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) + case .verticalWhitespace: + matched = c.isNewline && (c.isASCII || !isStrictAscii) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !isStrictAscii) + if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { + input.unicodeScalars.formIndex(after: &next) + } + case .whitespace: + matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) + } + + if matched { + currentPosition = next + return true + } else { + signalFailure() + return false + } + } + + mutating func regexAssert(by payload: AssertionPayload) throws -> Bool { + // Future work: Optimize layout and dispatch + + // FIXME: Depends on API model we have... We may want to + // think through some of these with API interactions in mind + // + // This might break how we use `bounds` for both slicing + // and things like `firstIndex`, that is `firstIndex` may + // need to supply both a slice bounds and a per-search bounds. 
+ switch payload.kind { + case .startOfSubject: return currentPosition == subjectBounds.lowerBound + + case .endOfSubjectBeforeNewline: + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input.index(after: currentPosition) == subjectBounds.upperBound + && input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound + && input.unicodeScalars[currentPosition].isNewline + } + + case .endOfSubject: return currentPosition == subjectBounds.upperBound + + case .resetStartOfMatch: + // FIXME: Figure out how to communicate this out + throw Unsupported(#"\K (reset/keep assertion)"#) + + case .firstMatchingPositionInSubject: + // TODO: We can probably build a nice model with API here + + // FIXME: This needs to be based on `searchBounds`, + // not the `subjectBounds` given as an argument here + // (Note: the above fixme was in reference to the old assert function API. + // Now that we're in processor, we have access to searchBounds) + return false + + case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) + + case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) + + case .startOfLine: + // FIXME: Anchor.startOfLine must always use this first branch + // The behavior of `^` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.startOfLine` anchor should always match the start + // of a line. Right now we don't distinguish between those anchors. 
+ if payload.anchorsMatchNewlines { + if currentPosition == subjectBounds.lowerBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } + } else { + return currentPosition == subjectBounds.lowerBound + } + + case .endOfLine: + // FIXME: Anchor.endOfLine must always use this first branch + // The behavior of `$` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.endOfLine` anchor should always match the end + // of a line. Right now we don't distinguish between those anchors. + if payload.anchorsMatchNewlines { + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars[currentPosition].isNewline + } + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + + case .notWordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? 
+ return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + } + } +} + +struct AssertionPayload: RawRepresentable { + var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } + var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } + + let rawValue: UInt64 + + init(rawValue: UInt64) { + self.rawValue = rawValue + assert(rawValue & _opcodeMask == 0) + } + + init(_ assertion: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) { + // 4 bits of options + let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 + let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 + let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? (1 << 52) : 0 + let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit + + // 4 bits for the assertion kind + // Future work: Optimize this layout + let kind: UInt64 + switch assertion { + case .endOfLine: kind = 0 + case .endOfSubject: kind = 1 + case .endOfSubjectBeforeNewline: kind = 2 + case .firstMatchingPositionInSubject: kind = 3 + case .notTextSegment: kind = 4 + case .notWordBoundary: kind = 5 + case .resetStartOfMatch: kind = 6 + case .startOfLine: kind = 7 + case .startOfSubject: kind = 8 + case .textSegment: kind = 9 + case .wordBoundary: kind = 10 + } + self.init(rawValue: kind + optionsBits) + } + + var kind: AST.Atom.AssertionKind { + let kind: AST.Atom.AssertionKind + switch self.rawValue & _assertionKindMask { + case 0: kind = .endOfLine + case 1: kind = .endOfSubject + case 2: kind = .endOfSubjectBeforeNewline + case 3: kind = .firstMatchingPositionInSubject + case 4: kind = .notTextSegment + case 5: kind = .notWordBoundary + case 6: kind = .resetStartOfMatch + case 7: kind = 
.startOfLine + case 8: kind = .startOfSubject + case 9: kind = .textSegment + case 10: kind = .wordBoundary + default: fatalError("Unreachable") + } + return kind + } + var anchorsMatchNewlines: Bool { (self.rawValue >> 55) & 1 == 1 } + var usesSimpleUnicodeBoundaries: Bool { (self.rawValue >> 54) & 1 == 1 } + var usesASCIIWord: Bool { (self.rawValue >> 53) & 1 == 1 } + var semanticLevel: MatchingOptions.SemanticLevel { + if (self.rawValue >> 52) & 1 == 1 { + return .unicodeScalar + } else { + return .graphemeCluster + } + } +} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 5c557dc81..b5bf804de 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,7 +9,6 @@ // //===----------------------------------------------------------------------===// -@_implementationOnly import _RegexParser // For AssertionKind enum MatchMode { case wholeString @@ -249,192 +248,6 @@ extension Processor { _uncheckedForcedConsumeOne() return true } - - mutating func matchBuiltin( - _ cc: BuiltinCC, - _ isStrictAscii: Bool - ) -> Bool { - guard let c = load() else { - signalFailure() - return false - } - - var matched: Bool - var next = input.index(after: currentPosition) - switch cc { - case .any, .anyGrapheme: matched = true - case .anyScalar: - matched = true - next = input.unicodeScalars.index(after: currentPosition) - case .digit: - matched = c.isNumber && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !isStrictAscii) - case .horizontalWhitespace: - matched = c.unicodeScalars.first?.isHorizontalWhitespace == true - && (c.isASCII || !isStrictAscii) - case .newlineSequence, .verticalWhitespace: - matched = c.unicodeScalars.first?.isNewline == true - && (c.isASCII || !isStrictAscii) - case .whitespace: - matched = c.isWhitespace && (c.isASCII || !isStrictAscii) - case .word: - matched = c.isWordCharacter && 
(c.isASCII || !isStrictAscii) - } - - if matched { - currentPosition = next - return true - } else { - signalFailure() - return false - } - } - - mutating func matchBuiltinScalar( - _ cc: BuiltinCC, - _ isStrictAscii: Bool - ) -> Bool { - guard let c = loadScalar() else { - signalFailure() - return false - } - - var matched: Bool - var next = input.unicodeScalars.index(after: currentPosition) - switch cc { - case .any: matched = true - case .anyScalar: matched = true - case .anyGrapheme: - matched = true - next = input.index(after: currentPosition) - case .digit: - matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) - case .horizontalWhitespace: - matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) - case .verticalWhitespace: - matched = c.isNewline && (c.isASCII || !isStrictAscii) - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !isStrictAscii) - if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { - input.unicodeScalars.formIndex(after: &next) - } - case .whitespace: - matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) - case .word: - matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) - } - - if matched { - currentPosition = next - return true - } else { - signalFailure() - return false - } - } - - mutating func regexAssert( - by kind: AST.Atom.AssertionKind, - _ anchorsMatchNewlines: Bool, - _ usesSimpleUnicodeBoundaries: Bool, - _ usesASCIIWord: Bool, - _ semanticLevel: MatchingOptions.SemanticLevel - ) throws -> Bool { - // Future work: Optimize layout and dispatch - - // FIXME: Depends on API model we have... 
We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. - switch kind { - case .startOfSubject: return currentPosition == subjectBounds.lowerBound - - case .endOfSubjectBeforeNewline: - if currentPosition == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input.index(after: currentPosition) == subjectBounds.upperBound - && input[currentPosition].isNewline - case .unicodeScalar: - return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound - && input.unicodeScalars[currentPosition].isNewline - } - - case .endOfSubject: return currentPosition == subjectBounds.upperBound - - case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out - throw Unsupported(#"\K (reset/keep assertion)"#) - - case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - // (Note: the above fixme was in reference to the old assert function API. - // Now that we're in processor, we have access to searchBounds) - return false - - case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) - - case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) - - case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
- if anchorsMatchNewlines { - if currentPosition == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: currentPosition)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline - } - } else { - return currentPosition == subjectBounds.lowerBound - } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. - if anchorsMatchNewlines { - if currentPosition == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[currentPosition].isNewline - case .unicodeScalar: - return input.unicodeScalars[currentPosition].isNewline - } - } else { - return currentPosition == subjectBounds.upperBound - } - - case .wordBoundary: - if usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return atSimpleBoundary(usesASCIIWord, semanticLevel) - } else { - return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } - - case .notWordBoundary: - if usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? 
- return !atSimpleBoundary(usesASCIIWord, semanticLevel) - } else { - return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } - } - } mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = @@ -615,19 +428,9 @@ extension Processor { controller.step() case .assertBy: - let (kind, - anchorsMatchNewlines, - usesSimpleUnicodeBoundaries, - usesASCIIWord, - semanticLevel) = payload.assertion + let payload = payload.assertion do { - guard try regexAssert( - by: kind, - anchorsMatchNewlines, - usesSimpleUnicodeBoundaries, - usesASCIIWord, - semanticLevel - ) else { + guard try regexAssert(by: payload) else { signalFailure() return } From 0466c25423115eb1546b3dfde7b4d1567b17b9c3 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 12:29:52 -0700 Subject: [PATCH 09/35] Cleanup assertions --- Sources/_StringProcessing/ByteCodeGen.swift | 3 +++ .../_StringProcessing/Engine/MEBuiltins.swift | 20 +++---------------- .../_StringProcessing/Engine/Processor.swift | 2 +- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 8e7aad6cd..c40ca2066 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -121,6 +121,9 @@ fileprivate extension Compiler.ByteCodeGen { mutating func emitAssertion( _ kind: AST.Atom.AssertionKind ) throws { + if kind == .resetStartOfMatch { + throw Unsupported(#"\K (reset/keep assertion)"#) + } builder.buildAssert( by: kind, options.anchorsMatchNewlines, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 8d7989a50..f79e8f463 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -87,15 +87,8 @@ extension Processor { } } - mutating func regexAssert(by payload: 
AssertionPayload) throws -> Bool { + mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { // Future work: Optimize layout and dispatch - - // FIXME: Depends on API model we have... We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. switch payload.kind { case .startOfSubject: return currentPosition == subjectBounds.lowerBound @@ -113,17 +106,10 @@ extension Processor { case .endOfSubject: return currentPosition == subjectBounds.upperBound case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out - throw Unsupported(#"\K (reset/keep assertion)"#) + fatalError("Unreachable, we should have thrown an error during compilation") case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - // (Note: the above fixme was in reference to the old assert function API. 
- // Now that we're in processor, we have access to searchBounds) - return false + return currentPosition == searchBounds.lowerBound case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index b5bf804de..4841d03e8 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -430,7 +430,7 @@ extension Processor { case .assertBy: let payload = payload.assertion do { - guard try regexAssert(by: payload) else { + guard try builtinAssert(by: payload) else { signalFailure() return } From 1ef91f3e3291d72b4776f834e75d2969fdcab555 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 17:28:21 -0700 Subject: [PATCH 10/35] First version --- Sources/_StringProcessing/ByteCodeGen.swift | 90 +++++++++++ .../Engine/Backtracking.swift | 54 ++++++- .../Engine/InstPayload.swift | 141 +++++++++++++++++ .../Engine/Instruction.swift | 7 + .../_StringProcessing/Engine/MEBuilder.swift | 43 +++++ .../_StringProcessing/Engine/MEBuiltins.swift | 19 ++- .../_StringProcessing/Engine/Processor.swift | 147 ++++++++++++++++-- .../_StringProcessing/Engine/Tracing.swift | 20 +++ Tests/RegexTests/CompileTests.swift | 16 +- Tests/RegexTests/MatchTests.swift | 5 + 10 files changed, 510 insertions(+), 32 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index de58d9d94..4c6cb5893 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -454,6 +454,30 @@ fileprivate extension Compiler.ByteCodeGen { let minTrips = low assert((extraTrips ?? 1) >= 0) + // We want to specialize quantification on certain inner nodes + // Those nodes are: + // - .char + // - .customCharacterClass + // - built in character classes + // - .any + // and only in grapheme semantic mode (fixme: for sure?) 
+ // We do this by wrapping a single instruction in a .quantify instruction + + // Lily note: I don't think we can support reluctant quant with this implementation + // style, or at least it wouldn't be any more efficient than the + // existing way we emit reluctant quantifiers + + // The main issue runQuantify solves is the fact that greedy quantifiers + // will loop through processor inefficiently and generate a ton of save points + let x = 65536 // lily todo: fix this once i determine the bit layout + if optimizationsEnabled && child.shouldDoFastQuant(options) && + minTrips < x && + extraTrips ?? 0 < x && + options.matchLevel == .graphemeCluster && + updatedKind != .reluctant { + emitFastQuant(child, updatedKind, minTrips, extraTrips) + } + // The below is a general algorithm for bounded and unbounded // quantification. It can be specialized when the min // is 0 or 1, or when extra trips is 1 or unbounded. @@ -638,6 +662,37 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + mutating func emitFastQuant( + _ child: DSLTree.Node, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? 
+ ) { + // These cases must stay in sync with DSLTree.Node.shouldDoFastQuant + // as well as the compilation paths for these nodes outside of quantification + + // Coupling is bad but we do it for _speed_ + switch child { + case .customCharacterClass(let ccc): + builder.buildQuantify(bitset: ccc.asAsciiBitset(options)!, kind, minTrips, extraTrips) + case .atom(let atom): + switch atom { + case .char(let c): + builder.buildQuantify(asciiChar: c._singleScalarAsciiValue!, kind, minTrips, extraTrips) + case .any: + builder.buildQuantifyAny(kind, minTrips, extraTrips) + case .unconverted(let astAtom): + builder.buildQuantify(builtin: astAtom.ast.characterClass!.builtinCC!, kind, minTrips, extraTrips) + default: + fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") + } + case .convertedRegexLiteral(let node, _): + emitFastQuant(node, kind, minTrips, extraTrips) + default: + fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") + } + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { @@ -784,4 +839,39 @@ extension DSLTree.Node { default: return false } } + + /// If the given node can be wrapped in a .quantify instruction + /// Currently this is conservative to reduce the coupling in ByteCodeGen between the normal case and + /// the quantified cases + /// + /// Essentially we trade off implementation complexity for runtime speed by adding more true cases to this + func shouldDoFastQuant(_ opts: MatchingOptions) -> Bool { + switch self { + case .customCharacterClass(let ccc): + // Only quantify ascii only character classes + return ccc.asAsciiBitset(opts) != nil + case .atom(let atom): + switch atom { + case .char(let c): + // Only quantify the most common path -> Single scalar ascii values + return c._singleScalarAsciiValue != nil + case .any: + // Only quantify if we have a default behavior .any + return !opts.dotMatchesNewline + 
case .unconverted(let astAtom): + // Only quantify non-strict built in character classes + if let builtin = astAtom.ast.characterClass?.builtinCC { + return !builtin.isStrict(options: opts) + } else { + return false + } + default: + return false + } + case .convertedRegexLiteral(let node, _): + return node.shouldDoFastQuant(opts) + default: + return false + } + } } diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 355702ac1..a92477020 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -10,11 +10,19 @@ //===----------------------------------------------------------------------===// extension Processor { - - // TODO: What all do we want to save? Configurable? - // TODO: Do we need to save any registers? - // TODO: Is this the right place to do function stack unwinding? - struct SavePoint { + enum SavePoint { + case basic(BasicSavePoint) + case quant(QuantifierSavePoint) + + var pc: InstructionAddress { + switch self { + case .basic(let sp): return sp.pc + case .quant(let sp): return sp.pc + } + } + } + + struct BasicSavePoint { var pc: InstructionAddress var pos: Position? 
@@ -46,17 +54,51 @@ extension Processor { (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } + + struct QuantifierSavePoint { + var pc: InstructionAddress + var quantifiedPositions: [Position] + var stackEnd: CallStackAddress + var captureEnds: [_StoredCapture] + var intRegisters: [Int] + var posRegisters: [Input.Index] + + + mutating func pop() -> ( + pc: InstructionAddress, + pos: Position, + stackEnd: CallStackAddress, + captureEnds: [_StoredCapture], + intRegisters: [Int], + PositionRegister: [Input.Index] + ) { + (pc, quantifiedPositions.popLast()!, stackEnd, captureEnds, intRegisters, posRegisters) + } + + var isEmpty: Bool { quantifiedPositions.isEmpty } + } func makeSavePoint( _ pc: InstructionAddress, addressOnly: Bool = false ) -> SavePoint { - SavePoint( + .basic(BasicSavePoint( pc: pc, pos: addressOnly ? nil : currentPosition, stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, + posRegisters: registers.positions)) + } + + func startQuantifierSavePoint() -> QuantifierSavePoint { + // Restores to the instruction AFTER the current quantifier instruction + QuantifierSavePoint( + pc: controller.pc + 1, + quantifiedPositions: [], + stackEnd: .init(callStack.count), + captureEnds: storedCaptures, + intRegisters: registers.ints, posRegisters: registers.positions) } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 96c307c75..733c31e42 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -9,6 +9,8 @@ // //===----------------------------------------------------------------------===// +@_implementationOnly import _RegexParser + extension Instruction { /// An instruction's payload packs operands and destination /// registers. 
@@ -239,6 +241,13 @@ extension Instruction.Payload { let isScalar = (val >> 14) & 1 == 1 return (cc, isStrict, isScalar) } + + init(quantify payload: QuantifyPayload) { + self.init(rawValue: payload.rawValue) + } + var quantify: QuantifyPayload { + QuantifyPayload.init(rawValue: self.rawValue & _payloadMask) + } init(consumer: ConsumeFunctionRegister) { self.init(consumer) @@ -355,3 +364,135 @@ extension Instruction.Payload { } } +struct QuantifyPayload: RawRepresentable { + let rawValue: UInt64 + + enum PayloadType: UInt64 { + case bitset = 0 + case asciiChar + case any + case builtin + } + + // The top 8 bits are reserved for the opcode so we have 56 bits to work with + // b55-b54 - Payload type (one of 4 types) + // b52-b37 - minTrips (16 bit int); b53 is unused + // b36-b20 - extraTrips (16 bit value shifted left by one; the low bit marks nil) + // b19-b16 - Quantification type (one of three types), should only use 2 bits of these + // b15-b0 - Payload value (depends on payload type) + static let quantKindShift: UInt64 = 16 + static let extraTripsShift: UInt64 = 20 + static let minTripsShift: UInt64 = 37 + static let typeShift: UInt64 = 54 + + static func packInfoValues( + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int?, + _ type: PayloadType + ) -> UInt64 { + let kindVal: UInt64 + switch kind { + case .eager: + kindVal = 0 + case .reluctant: + kindVal = 1 + case .possessive: + kindVal = 2 + } + let extraTripsVal: UInt64 = extraTrips == nil ? 1 : UInt64(extraTrips!) << 1 + return (kindVal << QuantifyPayload.quantKindShift) + + (extraTripsVal << QuantifyPayload.extraTripsShift) + + (UInt64(minTrips) << QuantifyPayload.minTripsShift) + + (type.rawValue << QuantifyPayload.typeShift) + } + + init(rawValue: UInt64) { + self.rawValue = rawValue + print("rawValue \(rawValue)") + assert(rawValue & _opcodeMask == 0) + } + + init( + bitset: AsciiBitsetRegister, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? 
+ ) { + assert(bitset.bits < 0xFF_FF) + self.rawValue = bitset.bits + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset) + } + + init( + asciiChar: UInt8, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? + ) { + self.rawValue = UInt64(asciiChar) + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar) + } + + init( + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? + ) { + self.rawValue = QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any) + } + + init( + builtin: BuiltinCC, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? + ) { + self.rawValue = builtin.rawValue + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) + } + + var type: PayloadType { + // future work: layout + switch (self.rawValue >> QuantifyPayload.typeShift) & 3 { + case 0: return .bitset + case 1: return .asciiChar + case 2: return .any + case 3: return .builtin + default: + fatalError("Unreachable") + } + } + + var quantKind: AST.Quantification.Kind { + switch (self.rawValue >> QuantifyPayload.quantKindShift) & 3 { + case 0: return .eager + case 1: return .reluctant + case 2: return .possessive + default: + fatalError("Unreachable") + } + } + + var minTrips: UInt64 { + (self.rawValue >> QuantifyPayload.minTripsShift) & 0xFF_FF + } + + var extraTrips: UInt64? { + let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & 0x1FF_FF + if val == 1 { + return nil + } else { + return val >> 1 + } + } + + var bitset: AsciiBitsetRegister { + TypedInt(self.rawValue & 0xFF_FF) + } + + var asciiChar: UInt8 { + UInt8(asserting: self.rawValue & 0xFF) + } + + var builtin: BuiltinCC { + BuiltinCC(rawValue: self.rawValue & 0xFF_FF)! 
+ } +} diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index b0be7d4fd..c696e87c3 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -189,6 +189,13 @@ extension Instruction { /// case splitSaving + /// Fused quantify, execute, save instruction + /// Quantifies the stored instruction in an inner loop instead of looping through instructions in processor + /// Only quantifies specific nodes + /// + /// quantify(_:QuantifyPayload) + /// + case quantify /// Begin the given capture /// /// beginCapture(_:CapReg) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index b0735c160..fb23faa55 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -205,6 +205,49 @@ extension MEProgram.Builder { .init(assertion: payload))) } + mutating func buildQuantify( + bitset: DSLTree.CustomCharacterClass.AsciiBitset, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? + ) { + instructions.append(.init( + .quantify, + .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, extraTrips)))) + } + + mutating func buildQuantify( + asciiChar: UInt8, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? + ) { + instructions.append(.init( + .quantify, + .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, extraTrips)))) + } + + mutating func buildQuantifyAny( + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? + ) { + instructions.append(.init( + .quantify, + .init(quantify: .init(kind, minTrips, extraTrips)))) + } + + mutating func buildQuantify( + builtin: BuiltinCC, + _ kind: AST.Quantification.Kind, + _ minTrips: Int, + _ extraTrips: Int? 
+ ) { + instructions.append(.init( + .quantify, + .init(quantify: .init(builtin: builtin, kind, minTrips, extraTrips)))) + } + mutating func buildAccept() { instructions.append(.init(.accept)) } diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index f79e8f463..824f01078 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -1,13 +1,13 @@ @_implementationOnly import _RegexParser // For AssertionKind extension Processor { - mutating func matchBuiltin( + @inline(__always) + mutating func _doMatchBuiltin( _ cc: BuiltinCC, _ isStrictAscii: Bool - ) -> Bool { + ) -> (Bool, Input.Index?) { guard let c = load() else { - signalFailure() - return false + return (false, nil) } var matched: Bool @@ -32,9 +32,16 @@ extension Processor { case .word: matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) } - + return (matched, next) + } + + mutating func matchBuiltin( + _ cc: BuiltinCC, + _ isStrictAscii: Bool + ) -> Bool { + let (matched, next) = _doMatchBuiltin(cc, isStrictAscii) if matched { - currentPosition = next + currentPosition = next! return true } else { signalFailure() diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 5f56ca881..50b482e81 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -244,18 +244,26 @@ extension Processor { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } + func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> (Bool, Input.Index?) 
{ + if s == loadScalar(), + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { + return (true, idx) + } else { + return (false, nil) + } + } + mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - guard s == loadScalar(), - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) - else { + let (matched, next) = _doMatchScalar(s, boundaryCheck) + guard matched else { signalFailure() return false } - currentPosition = idx + currentPosition = next! return true } @@ -278,13 +286,22 @@ extension Processor { return true } + @inline(__always) + func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Bool { + if let cur = load(), bitset.matches(char: cur) { + return true + } else { + return false + } + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { - guard let cur = load(), bitset.matches(char: cur) else { + guard _doMatchBitset(bitset) else { signalFailure() return false } @@ -306,13 +323,115 @@ extension Processor { return true } + mutating func runQuantify(_ payload: QuantifyPayload) { + var trips = 0 + var extraTrips = payload.extraTrips + var savePoint = startQuantifierSavePoint() + + // Initialize values + // lily note: I hope swift/llvm is smart enough to recognize the code paths + // and elide the unwrapping checks, but I'm not sure + let bitset: DSLTree.CustomCharacterClass.AsciiBitset? + switch payload.type { + case .bitset: + bitset = registers[payload.bitset] + default: + bitset = nil + } + let scalar: UnicodeScalar? 
+ switch payload.type { + case .asciiChar: + scalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) + default: scalar = nil + } + let builtin: BuiltinCC? + switch payload.type { + case .builtin: + builtin = payload.builtin + default: + builtin = nil + } + + print("running quantify") + while extraTrips ?? 1 > 0 { + print("in quantify \(trips) \(extraTrips) \(load())") + let res: Bool + var next: Input.Index? = input.index(after: currentPosition) + switch payload.type { + case .bitset: + res = _doMatchBitset(bitset!) + case .asciiChar: + // lily note: should this just be match character since we already + // have next index? we always do the boundary check after all, why recompute + (res, next) = _doMatchScalar(scalar!, true) + case .builtin: + // We only emit .quantify if it is non-strict ascii + (res, next) = _doMatchBuiltin(builtin!, false) + case .any: + res = true + } + + guard res else { break } // goto exit-policy + + currentPosition = next! + extraTrips = extraTrips.map({$0 - 1}) + trips += 1 + + switch payload.quantKind { + case .eager: + savePoint.quantifiedPositions.append(currentPosition) + case .reluctant: + // lily note: maybe I should do this check earlier, but itll + // mean lots and lots of fatal errors... 
hmmm + // maybe a new quantkind type that only has those two so i can drop + // that extra bit entirely + fatalError("Unreachable") + case .possessive: + continue + } + } + + print("exit policy") + // --- exit policy + if trips < payload.minTrips { + signalFailure() + return + } + // emit save point + if trips > 0 { + if payload.quantKind == .eager { + savePoints.append(.quant(savePoint)) + } else { + // Possessive, just emit one save point + savePoints.append(makeSavePoint(controller.pc + 1)) + } + } + } + mutating func signalFailure() { - guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = - savePoints.popLast()?.destructure - else { + guard let savePoint = savePoints.popLast() else { state = .fail return } + let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters): ( + pc: InstructionAddress, + pos: Position?, + stackEnd: CallStackAddress, + captureEnds: [_StoredCapture], + intRegisters: [Int], + PositionRegister: [Input.Index] + ) + switch savePoint { + case .basic(let sp): + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = sp.destructure + case .quant(var sp): + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = sp.pop() + // Add back the quantifier save point if it still has more elements + if !sp.isEmpty { + savePoints.append(.quant(sp)) + } + } + assert(stackEnd.rawValue <= callStack.count) assert(capEnds.count == storedCaptures.count) @@ -488,6 +607,10 @@ extension Processor { controller.step() } } + case .quantify: + let quant = payload.quantify + runQuantify(quant) + controller.step() case .consumeBy: let reg = payload.consumer diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 525beec63..74a218180 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -54,6 +54,17 @@ extension Instruction.Payload: CustomStringConvertible { } extension Processor.SavePoint { + func describe(in input: String) -> 
String { + switch self { + case .basic(let basicSavePoint): + return "BasicSP(\(basicSavePoint.describe(in: input)))" + case .quant(let quantifierSavePoint): + return "QuantSP(\(quantifierSavePoint.describe(in: input)))" + } + } +} + +extension Processor.BasicSavePoint { func describe(in input: String) -> String { let posStr: String if let p = self.pos { @@ -66,3 +77,12 @@ extension Processor.SavePoint { """ } } + +extension Processor.QuantifierSavePoint { + func describe(in input: String) -> String { + let posStr = "\(input.distance(from: input.startIndex, to: quantifiedPositions.first!)) to \(input.distance(from: input.startIndex, to: quantifiedPositions.last!))" + return """ + pc: \(self.pc), posRange: \(posStr), stackEnd: \(stackEnd) + """ + } +} diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..99da8b708 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -37,6 +37,7 @@ enum DecodedInstr { case matchScalarUnchecked case matchBitsetScalar case matchBitset + case matchBuiltin case consumeBy case assertBy case matchBy @@ -45,8 +46,7 @@ enum DecodedInstr { case endCapture case transformCapture case captureValue - case builtinAssertion - case builtinCharacterClass + case quantify } extension DecodedInstr { @@ -116,7 +116,7 @@ extension DecodedInstr { return .matchBitset } case .consumeBy: - return consumeBy + return .consumeBy case .assertBy: return .assertBy case .matchBy: @@ -131,11 +131,11 @@ extension DecodedInstr { return .transformCapture case .captureValue: return .captureValue - case .builtinAssertion: - return .builtinAssertion - case .builtinCharacterClass: - return .builtinCharacterClass -} + case .quantify: + return .quantify + case .matchBuiltin: + return .matchBuiltin + } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f2715eac1..9cbcbbcc9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ 
b/Tests/RegexTests/MatchTests.swift @@ -1956,4 +1956,9 @@ extension RegexTests { expectCompletion(regex: #"(a{,4})*"#, in: "aa") expectCompletion(regex: #"((|)+)*"#, in: "aa") } + + func testFastQuant() throws { + let r = try Regex("a*") + try r.firstMatch(in: "aab") + } } From f401e84e550d613eaaafa8c6c557d666fe0fd71a Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 17:35:20 -0700 Subject: [PATCH 11/35] Fix tests --- Tests/RegexTests/CompileTests.swift | 154 ++++++++++++++-------------- Tests/RegexTests/MatchTests.swift | 11 +- 2 files changed, 79 insertions(+), 86 deletions(-) diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..a7c0ee531 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -37,6 +37,7 @@ enum DecodedInstr { case matchScalarUnchecked case matchBitsetScalar case matchBitset + case matchBuiltin case consumeBy case assertBy case matchBy @@ -45,8 +46,6 @@ enum DecodedInstr { case endCapture case transformCapture case captureValue - case builtinAssertion - case builtinCharacterClass } extension DecodedInstr { @@ -55,87 +54,84 @@ extension DecodedInstr { /// /// Must stay in sync with Processor.cycle static func decode(_ instruction: Instruction) -> DecodedInstr { - let (opcode, payload) = instruction.destructure - - switch opcode { - case .invalid: - fatalError("Invalid program") - case .moveImmediate: - return .moveImmediate - case .moveCurrentPosition: - return .moveCurrentPosition - case .branch: - return .branch - case .condBranchZeroElseDecrement: - return .condBranchZeroElseDecrement - case .condBranchSamePosition: - return .condBranchSamePosition - case .save: - return .save - case .saveAddress: - return .saveAddress - case .splitSaving: - return .splitSaving - case .clear: - return .clear - case .clearThrough: - return .clearThrough - case .accept: - return .accept - case .fail: - return .fail - case .advance: - return .advance - case 
.match: - let (isCaseInsensitive, _) = payload.elementPayload - if isCaseInsensitive { - return .matchCaseInsensitive - } else { - return .match - } - case .matchScalar: - let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if boundaryCheck { - return .matchScalarCaseInsensitive - } else { - return .matchScalarCaseInsensitiveUnchecked - } + let (opcode, payload) = instruction.destructure + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: + return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return .condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive } else { - if boundaryCheck { - return .matchScalar - } else { - return .matchScalarUnchecked - } + return .matchScalarCaseInsensitiveUnchecked } - case .matchBitset: - let (isScalar, _) = payload.bitsetPayload - if isScalar { - return .matchBitsetScalar + } else { + if boundaryCheck { + return .matchScalar } else { - return .matchBitset + return .matchScalarUnchecked } - case .consumeBy: - return consumeBy - case .assertBy: - return .assertBy - case .matchBy: - return .matchBy - case .backreference: - return .backreference - case .beginCapture: - return .beginCapture - case .endCapture: - return 
.endCapture - case .transformCapture: - return .transformCapture - case .captureValue: - return .captureValue - case .builtinAssertion: - return .builtinAssertion - case .builtinCharacterClass: - return .builtinCharacterClass -} + } + case .matchBitset: + let (isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .matchBuiltin: + return .matchBuiltin + } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f2715eac1..377d5a7be 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1072,13 +1072,10 @@ extension RegexTests { ("123 456", "23")) #endif - // TODO: \G and \K - do { - let regex = try Regex(#"\Gab"#, as: Substring.self) - XCTExpectFailure { - XCTAssertEqual("abab".matches(of: regex).map(\.output), ["ab", "ab"]) - } - } + // \G and \K + let regex = try Regex(#"\Gab"#, as: Substring.self) + XCTAssertEqual("abab".matches(of: regex).map(\.output), ["ab", "ab"]) + // TODO: Oniguruma \y and \Y firstMatchTests( From b09f45fc8e624674307adcc04023ce16c2c26e4f Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 17:37:55 -0700 Subject: [PATCH 12/35] Update opcode description for assertBy --- Sources/_StringProcessing/Engine/Instruction.swift | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index b0be7d4fd..ac63dc7f5 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -123,16 
+123,12 @@ extension Instruction { /// Operand: Consume function register to call. case consumeBy - /// Custom lookaround assertion operation. - /// Triggers a failure if customFunction returns false. + /// Lookaround assertion operation. Performs a zero width assertion based on + /// the assertion type and options stored in the payload /// - /// assert(_ customFunction: ( - /// input: Input, - /// currentPos: Position, - /// bounds: Range - /// ) -> Bool) + /// assert(_:AssertionPayload) /// - /// Operands: destination bool register, assert hook register + /// Operands: AssertionPayload containing assertion type and options case assertBy /// Custom value-creating consume operation. From 00ae70b1e674a5c2225e1e29ab9223b2333d9a9a Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 12:21:58 -0700 Subject: [PATCH 13/35] Bugfixes --- Sources/_StringProcessing/ByteCodeGen.swift | 1 + Sources/_StringProcessing/Compiler.swift | 4 +- .../Engine/InstPayload.swift | 1 - .../_StringProcessing/Engine/MECapture.swift | 6 -- .../_StringProcessing/Engine/Processor.swift | 84 +++++++++++-------- Tests/RegexTests/MatchTests.swift | 12 ++- 6 files changed, 64 insertions(+), 44 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 4c6cb5893..563125bb0 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -476,6 +476,7 @@ fileprivate extension Compiler.ByteCodeGen { options.matchLevel == .graphemeCluster && updatedKind != .reluctant { emitFastQuant(child, updatedKind, minTrips, extraTrips) + return } // The below is a general algorithm for bounded and unbounded diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530126a32..47cf33e6a 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -38,7 +38,9 @@ class Compiler { compileOptions: compileOptions, 
captureList: tree.captureList) - return try codegen.emitRoot(tree.root) + let p = try codegen.emitRoot(tree.root) + print(p) + return p } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 733c31e42..99cb93885 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -409,7 +409,6 @@ struct QuantifyPayload: RawRepresentable { init(rawValue: UInt64) { self.rawValue = rawValue - print("rawValue \(rawValue)") assert(rawValue & _opcodeMask == 0) } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 53243cd34..4bea21133 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -88,12 +88,6 @@ extension Processor { } } -extension Processor._StoredCapture: CustomStringConvertible { - var description: String { - return String(describing: self) - } -} - struct MECaptureList { var values: Array var referencedCaptureOffsets: [ReferenceID: Int] diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 50b482e81..e440ad6d9 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -245,14 +245,17 @@ extension Processor { } func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> (Bool, Input.Index?) 
{ + // print("doing match scalar \(s)") if s == loadScalar(), let idx = input.unicodeScalars.index( currentPosition, offsetBy: 1, limitedBy: end), (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { + // print("matched") return (true, idx) } else { + // print("did not match") return (false, nil) } } @@ -323,14 +326,14 @@ extension Processor { return true } - mutating func runQuantify(_ payload: QuantifyPayload) { + mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { var trips = 0 var extraTrips = payload.extraTrips var savePoint = startQuantifierSavePoint() // Initialize values // lily note: I hope swift/llvm is smart enough to recognize the code paths - // and elide the unwrapping checks, but I'm not sure + // and elide the unwrapping checks in the hot loop, but I'm not sure let bitset: DSLTree.CustomCharacterClass.AsciiBitset? switch payload.type { case .bitset: @@ -351,65 +354,74 @@ extension Processor { default: builtin = nil } - + + if payload.minTrips == 0 { + // exit policy + extraTrips = extraTrips.map({$0 - 1}) + if payload.quantKind == .eager { + savePoint.quantifiedPositions.append(currentPosition) + } + + } + print("running quantify") - while extraTrips ?? 1 > 0 { - print("in quantify \(trips) \(extraTrips) \(load())") + while true { + print("in quantify \(trips) \(extraTrips) \(load()) \(payload.type) \(scalar)") + // fixme: maybe the _do methods should always return the next index, lets + // us remove the res variable entirely. + // dunno how thatll affect the normal matching instructions tho let res: Bool - var next: Input.Index? = input.index(after: currentPosition) + var next: Input.Index? switch payload.type { case .bitset: res = _doMatchBitset(bitset!) + next = res ? input.index(after: currentPosition) : nil case .asciiChar: - // lily note: should this just be match character since we already - // have next index? 
we always do the boundary check after all, why recompute (res, next) = _doMatchScalar(scalar!, true) case .builtin: // We only emit .quantify if it is non-strict ascii (res, next) = _doMatchBuiltin(builtin!, false) case .any: - res = true + // // print("\(input.distance(from: currentPosition, to: input.endIndex))") + res = currentPosition != input.endIndex + next = res ? input.index(after: currentPosition) : nil } - guard res else { break } // goto exit-policy + guard res else { break } // goto exit currentPosition = next! - extraTrips = extraTrips.map({$0 - 1}) trips += 1 - switch payload.quantKind { - case .eager: + // min-trips control block + if trips < payload.minTrips { continue } // goto loop + + // exit policy + if extraTrips == 0 { break } // goto exit + extraTrips = extraTrips.map({$0 - 1}) + if payload.quantKind == .eager { savePoint.quantifiedPositions.append(currentPosition) - case .reluctant: - // lily note: maybe I should do this check earlier, but itll - // mean lots and lots of fatal errors... hmmm - // maybe a new quantkind type that only has those two so i can drop - // that extra bit entirely - fatalError("Unreachable") - case .possessive: - continue } } - print("exit policy") // --- exit policy if trips < payload.minTrips { + print("failed to quantify to minTrips, signalling failure") signalFailure() - return + return false } - // emit save point - if trips > 0 { - if payload.quantKind == .eager { - savePoints.append(.quant(savePoint)) - } else { - // Possessive, just emit one save point - savePoints.append(makeSavePoint(controller.pc + 1)) - } + + print("Exiting quantify") + if payload.quantKind == .eager && !savePoint.isEmpty { + // print("appending eager sp") + savePoints.append(.quant(savePoint)) } + return true } mutating func signalFailure() { + // print("signal failure") guard let savePoint = savePoints.popLast() else { + // print("no save points? 
faililng") state = .fail return } @@ -424,10 +436,13 @@ extension Processor { switch savePoint { case .basic(let sp): (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = sp.destructure + // print("basic sp, restoring to \(pc) \(input.distance(from: input.startIndex, to: pos!))") case .quant(var sp): (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = sp.pop() + // print("restoring quant sp to pc \(pc) \(input.distance(from: input.startIndex, to: pos!))") // Add back the quantifier save point if it still has more elements if !sp.isEmpty { + // print("adding it back") savePoints.append(.quant(sp)) } } @@ -485,7 +500,7 @@ extension Processor { _checkInvariants() } let (opcode, payload) = fetch().destructure - + // print("cycle \(currentPC) \(opcode)") switch opcode { case .invalid: fatalError("Invalid program") @@ -609,8 +624,9 @@ extension Processor { } case .quantify: let quant = payload.quantify - runQuantify(quant) - controller.step() + if runQuantify(quant) { + controller.step() + } case .consumeBy: let reg = payload.consumer diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 9cbcbbcc9..d499c8c62 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1958,7 +1958,15 @@ extension RegexTests { } func testFastQuant() throws { - let r = try Regex("a*") - try r.firstMatch(in: "aab") + firstMatchTest( + #"a{,2}x"#, input: "123aaaxyz", match: "aax") +// firstMatchTest( +// #"a{1,2}"#, input: "123aaaxyz", match: "aa") +// var r = try! 
Regex("a|b+|c*", as: Substring.self) +// let matches = "aaabbbccc".matches(of: r) +// print(matches.map(\.output)) +// r._setCompilerOptionsForTesting(.disableOptimizations) +// let unoptMatches = "aaabbbccc".matches(of: r) +// print(unoptMatches.map(\.output)) } } From 62bec3f845b98d33f45c44929d7b5e572c6b9669 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 12:27:35 -0700 Subject: [PATCH 14/35] Finish bugfixes --- Sources/_StringProcessing/Compiler.swift | 2 +- Sources/_StringProcessing/Engine/Processor.swift | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 47cf33e6a..5b4ad2b28 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -39,7 +39,7 @@ class Compiler { compileOptions, captureList: tree.captureList) let p = try codegen.emitRoot(tree.root) - print(p) + // print(p) return p } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index e440ad6d9..c8078e99c 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -364,9 +364,9 @@ extension Processor { } - print("running quantify") + // print("running quantify") while true { - print("in quantify \(trips) \(extraTrips) \(load()) \(payload.type) \(scalar)") + // print("in quantify \(trips) \(extraTrips) \(load()) \(payload.type) \(scalar)") // fixme: maybe the _do methods should always return the next index, lets // us remove the res variable entirely. 
// dunno how thatll affect the normal matching instructions tho @@ -383,7 +383,7 @@ extension Processor { (res, next) = _doMatchBuiltin(builtin!, false) case .any: // // print("\(input.distance(from: currentPosition, to: input.endIndex))") - res = currentPosition != input.endIndex + res = currentPosition != input.endIndex && !input[currentPosition].isNewline next = res ? input.index(after: currentPosition) : nil } @@ -405,12 +405,12 @@ extension Processor { // --- exit policy if trips < payload.minTrips { - print("failed to quantify to minTrips, signalling failure") + // print("failed to quantify to minTrips, signalling failure") signalFailure() return false } - print("Exiting quantify") + // print("Exiting quantify") if payload.quantKind == .eager && !savePoint.isEmpty { // print("appending eager sp") savePoints.append(.quant(savePoint)) From 8cf6b219228657fe8d11ca66a42f58c6126a4bf0 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 14:48:46 -0700 Subject: [PATCH 15/35] Fixed array copy issue with savepoints --- .../Engine/Backtracking.swift | 54 +++++++------------ .../_StringProcessing/Engine/MEBuiltins.swift | 1 - .../_StringProcessing/Engine/Processor.swift | 31 +++++------ .../_StringProcessing/Engine/Tracing.swift | 26 ++------- 4 files changed, 35 insertions(+), 77 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index a92477020..a178a82eb 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -10,22 +10,11 @@ //===----------------------------------------------------------------------===// extension Processor { - enum SavePoint { - case basic(BasicSavePoint) - case quant(QuantifierSavePoint) - - var pc: InstructionAddress { - switch self { - case .basic(let sp): return sp.pc - case .quant(let sp): return sp.pc - } - } - } - - struct BasicSavePoint { + struct SavePoint { var pc: 
InstructionAddress var pos: Position? - + // Quantifiers may store many positions to restore to + var additionalPositions: [Position] // The end of the call stack, so we can slice it off // when failing inside a call. // @@ -51,51 +40,44 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) { - (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) + assert(additionalPositions.isEmpty) + return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } - } - - struct QuantifierSavePoint { - var pc: InstructionAddress - var quantifiedPositions: [Position] - var stackEnd: CallStackAddress - var captureEnds: [_StoredCapture] - var intRegisters: [Int] - var posRegisters: [Input.Index] - - - mutating func pop() -> ( + + var isEmpty: Bool { additionalPositions.isEmpty } + + mutating func removeLast() -> ( pc: InstructionAddress, - pos: Position, + pos: Position?, stackEnd: CallStackAddress, captureEnds: [_StoredCapture], intRegisters: [Int], PositionRegister: [Input.Index] ) { - (pc, quantifiedPositions.popLast()!, stackEnd, captureEnds, intRegisters, posRegisters) + (pc, additionalPositions.removeLast(), stackEnd, captureEnds, intRegisters, posRegisters) } - - var isEmpty: Bool { quantifiedPositions.isEmpty } } func makeSavePoint( _ pc: InstructionAddress, addressOnly: Bool = false ) -> SavePoint { - .basic(BasicSavePoint( + SavePoint( pc: pc, pos: addressOnly ? 
nil : currentPosition, + additionalPositions: [], stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, - posRegisters: registers.positions)) + posRegisters: registers.positions) } - func startQuantifierSavePoint() -> QuantifierSavePoint { + func startQuantifierSavePoint() -> SavePoint { // Restores to the instruction AFTER the current quantifier instruction - QuantifierSavePoint( + SavePoint( pc: controller.pc + 1, - quantifiedPositions: [], + pos: nil, + additionalPositions: [], stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 824f01078..d677784b2 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -1,7 +1,6 @@ @_implementationOnly import _RegexParser // For AssertionKind extension Processor { - @inline(__always) mutating func _doMatchBuiltin( _ cc: BuiltinCC, _ isStrictAscii: Bool diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index c8078e99c..196ae13fe 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -289,7 +289,7 @@ extension Processor { return true } - @inline(__always) + func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Bool { if let cur = load(), bitset.matches(char: cur) { return true @@ -359,7 +359,7 @@ extension Processor { // exit policy extraTrips = extraTrips.map({$0 - 1}) if payload.quantKind == .eager { - savePoint.quantifiedPositions.append(currentPosition) + savePoint.additionalPositions.append(currentPosition) } } @@ -399,7 +399,7 @@ extension Processor { if extraTrips == 0 { break } // goto exit extraTrips = extraTrips.map({$0 - 1}) if payload.quantKind == .eager { - 
savePoint.quantifiedPositions.append(currentPosition) + savePoint.additionalPositions.append(currentPosition) } } @@ -413,15 +413,13 @@ extension Processor { // print("Exiting quantify") if payload.quantKind == .eager && !savePoint.isEmpty { // print("appending eager sp") - savePoints.append(.quant(savePoint)) + savePoints.append(savePoint) } return true } mutating func signalFailure() { - // print("signal failure") - guard let savePoint = savePoints.popLast() else { - // print("no save points? faililng") + guard var savePoint = savePoints.popLast() else { state = .fail return } @@ -433,18 +431,13 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) - switch savePoint { - case .basic(let sp): - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = sp.destructure - // print("basic sp, restoring to \(pc) \(input.distance(from: input.startIndex, to: pos!))") - case .quant(var sp): - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = sp.pop() - // print("restoring quant sp to pc \(pc) \(input.distance(from: input.startIndex, to: pos!))") - // Add back the quantifier save point if it still has more elements - if !sp.isEmpty { - // print("adding it back") - savePoints.append(.quant(sp)) + if !savePoint.isEmpty { + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoint.removeLast() + if !savePoint.isEmpty { + savePoints.append(savePoint) } + } else { + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoint.destructure } assert(stackEnd.rawValue <= callStack.count) @@ -500,7 +493,7 @@ extension Processor { _checkInvariants() } let (opcode, payload) = fetch().destructure - // print("cycle \(currentPC) \(opcode)") + // print("cycle \(currentPC) \(opcode) | pos \(input.distance(from: input.startIndex, to: currentPosition))") switch opcode { case .invalid: fatalError("Invalid program") diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 
74a218180..a829e9d27 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -54,35 +54,19 @@ extension Instruction.Payload: CustomStringConvertible { } extension Processor.SavePoint { - func describe(in input: String) -> String { - switch self { - case .basic(let basicSavePoint): - return "BasicSP(\(basicSavePoint.describe(in: input)))" - case .quant(let quantifierSavePoint): - return "QuantSP(\(quantifierSavePoint.describe(in: input)))" - } - } -} - -extension Processor.BasicSavePoint { func describe(in input: String) -> String { let posStr: String if let p = self.pos { posStr = "\(input.distance(from: input.startIndex, to: p))" } else { - posStr = "" + if additionalPositions.isEmpty { + posStr = "" + } else { + posStr = "\(additionalPositions.map { p in input.distance(from: input.startIndex, to: p) })" + } } return """ pc: \(self.pc), pos: \(posStr), stackEnd: \(stackEnd) """ } } - -extension Processor.QuantifierSavePoint { - func describe(in input: String) -> String { - let posStr = "\(input.distance(from: input.startIndex, to: quantifiedPositions.first!)) to \(input.distance(from: input.startIndex, to: quantifiedPositions.last!))" - return """ - pc: \(self.pc), posRange: \(posStr), stackEnd: \(stackEnd) - """ - } -} From 8d61e7dc2b9ae8360d4942e7c395b01a65be8914 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 15:31:21 -0700 Subject: [PATCH 16/35] Add assertions + cleanup --- Sources/_StringProcessing/ByteCodeGen.swift | 36 ++++++++++++++++--- .../Engine/InstPayload.swift | 8 +++-- .../_StringProcessing/Engine/Processor.swift | 21 +++++------ 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 563125bb0..243117072 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -670,25 +670,46 @@ fileprivate extension Compiler.ByteCodeGen 
{ _ extraTrips: Int? ) { // These cases must stay in sync with DSLTree.Node.shouldDoFastQuant - // as well as the compilation paths for these nodes outside of quantification + // as well as the compilation paths for these nodes outside of quantification\ + + // All assumptions made by the processor in runQuantify() must be checked here + // If an error is thrown here, there must be a mistake in shouldDoFastQuant + // letting in an invalid case // Coupling is bad but we do it for _speed_ switch child { case .customCharacterClass(let ccc): - builder.buildQuantify(bitset: ccc.asAsciiBitset(options)!, kind, minTrips, extraTrips) + if let bitset = ccc.asAsciiBitset(options) { + builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips) + } else { + fatalError("Entered emitFastQuant with an invalid case: Unable to generate bitset") + } case .atom(let atom): switch atom { case .char(let c): - builder.buildQuantify(asciiChar: c._singleScalarAsciiValue!, kind, minTrips, extraTrips) + if let val = c._singleScalarAsciiValue { + builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips) + } else { + fatalError("Entered emitFastQuant with an invalid case: Character is not single scalar ascii") + } case .any: + assert(!options.dotMatchesNewline, "Entered emitFastQuant with an invalid case: Any matches newlines") builder.buildQuantifyAny(kind, minTrips, extraTrips) case .unconverted(let astAtom): - builder.buildQuantify(builtin: astAtom.ast.characterClass!.builtinCC!, kind, minTrips, extraTrips) + if let builtin = astAtom.ast.characterClass?.builtinCC { + assert(!builtin.isStrict(options: options), "Entered emitFastQuant with an invalid case: Strict builtin character class") + builder.buildQuantify(builtin: builtin, kind, minTrips, extraTrips) + } else { + fatalError("Entered emitFastQuant with an invalid case: Not a builtin character class") + } default: fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") } case 
.convertedRegexLiteral(let node, _): emitFastQuant(node, kind, minTrips, extraTrips) + case .nonCapturingGroup(let groupKind, let node): + assert(groupKind.ast == .nonCapture, "Entered emitFastQuant with an invalid case: Invalid nonCapturingGroup type") + emitFastQuant(node, kind, minTrips, extraTrips) default: fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") } @@ -871,6 +892,13 @@ extension DSLTree.Node { } case .convertedRegexLiteral(let node, _): return node.shouldDoFastQuant(opts) + case .nonCapturingGroup(let kind, let child): + switch kind.ast { + case .nonCapture: + return child.shouldDoFastQuant(opts) + default: + return false + } default: return false } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 99cb93885..c81fca76d 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -374,13 +374,15 @@ struct QuantifyPayload: RawRepresentable { case builtin } + // Future work: optimize this layout -> payload type should be a fast switch // The top 8 bits are reserved for the opcode so we have 56 bits to work with // b55-b54 - Payload type (one of 4 types) // b53-b37 - minTrips (16 bit int) // b37-b20 - extraTrips (16 bit value, one bit for nil) - // b20-b16 - Quantification type (one of three types), should only use 2 bits of these + // b20-b18 - Quantification type (one of three types) + // b18-b16 - Unused // b16-b0 - Payload value (depends on payload type) - static let quantKindShift: UInt64 = 16 + static let quantKindShift: UInt64 = 18 static let extraTripsShift: UInt64 = 20 static let minTripsShift: UInt64 = 37 static let typeShift: UInt64 = 54 @@ -445,11 +447,11 @@ struct QuantifyPayload: RawRepresentable { _ minTrips: Int, _ extraTrips: Int? 
) { + assert(builtin.rawValue < 0xFF_FF) self.rawValue = builtin.rawValue + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) } var type: PayloadType { - // future work: layout switch (self.rawValue >> QuantifyPayload.typeShift) & 3 { case 0: return .bitset case 1: return .asciiChar diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 196ae13fe..9f2728ce4 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -289,7 +289,6 @@ extension Processor { return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Bool { if let cur = load(), bitset.matches(char: cur) { return true @@ -355,21 +354,22 @@ extension Processor { builtin = nil } + // fixme: is there a way structure the loops so we don't duplicate the + // exit policy here? if payload.minTrips == 0 { // exit policy extraTrips = extraTrips.map({$0 - 1}) if payload.quantKind == .eager { savePoint.additionalPositions.append(currentPosition) } - } - // print("running quantify") + // > loop while true { - // print("in quantify \(trips) \(extraTrips) \(load()) \(payload.type) \(scalar)") // fixme: maybe the _do methods should always return the next index, lets // us remove the res variable entirely. - // dunno how thatll affect the normal matching instructions tho + // dunno how thatll affect the normal matching instructions tho, I wanted + // to leave the normal matching as untouched as possible let res: Bool var next: Input.Index? switch payload.type { @@ -382,7 +382,6 @@ extension Processor { // We only emit .quantify if it is non-strict ascii (res, next) = _doMatchBuiltin(builtin!, false) case .any: - // // print("\(input.distance(from: currentPosition, to: input.endIndex))") res = currentPosition != input.endIndex && !input[currentPosition].isNewline next = res ? 
input.index(after: currentPosition) : nil } @@ -392,10 +391,10 @@ extension Processor { currentPosition = next! trips += 1 - // min-trips control block + // > min-trips control block if trips < payload.minTrips { continue } // goto loop - // exit policy + // > exit-policy control block if extraTrips == 0 { break } // goto exit extraTrips = extraTrips.map({$0 - 1}) if payload.quantKind == .eager { @@ -403,16 +402,13 @@ extension Processor { } } - // --- exit policy + // > exit if trips < payload.minTrips { - // print("failed to quantify to minTrips, signalling failure") signalFailure() return false } - // print("Exiting quantify") if payload.quantKind == .eager && !savePoint.isEmpty { - // print("appending eager sp") savePoints.append(savePoint) } return true @@ -493,7 +489,6 @@ extension Processor { _checkInvariants() } let (opcode, payload) = fetch().destructure - // print("cycle \(currentPC) \(opcode) | pos \(input.distance(from: input.startIndex, to: currentPosition))") switch opcode { case .invalid: fatalError("Invalid program") From c0bc139291e5f46844ed3eda5d196301224b70b8 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 15:35:23 -0700 Subject: [PATCH 17/35] Clean up loop structure in runQuantify --- .../_StringProcessing/Engine/Processor.swift | 49 +++++++------------ Tests/RegexTests/MatchTests.swift | 13 ----- 2 files changed, 19 insertions(+), 43 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 9f2728ce4..887c05698 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -354,52 +354,41 @@ extension Processor { builtin = nil } - // fixme: is there a way structure the loops so we don't duplicate the - // exit policy here? 
- if payload.minTrips == 0 { - // exit policy - extraTrips = extraTrips.map({$0 - 1}) - if payload.quantKind == .eager { - savePoint.additionalPositions.append(currentPosition) + while true { + if trips >= payload.minTrips { + // exit policy + // fixme: is there a way to optimize the next two lines out if we know + // extraTrips is nil? + if extraTrips == 0 { break } // goto exit + extraTrips = extraTrips.map({$0 - 1}) + if payload.quantKind == .eager { + savePoint.additionalPositions.append(currentPosition) + } } - } - // > loop - while true { // fixme: maybe the _do methods should always return the next index, lets - // us remove the res variable entirely. + // us remove the matched variable entirely. // dunno how thatll affect the normal matching instructions tho, I wanted // to leave the normal matching as untouched as possible - let res: Bool + let matched: Bool var next: Input.Index? switch payload.type { case .bitset: - res = _doMatchBitset(bitset!) - next = res ? input.index(after: currentPosition) : nil + matched = _doMatchBitset(bitset!) + next = matched ? input.index(after: currentPosition) : nil case .asciiChar: - (res, next) = _doMatchScalar(scalar!, true) + (matched, next) = _doMatchScalar(scalar!, true) case .builtin: // We only emit .quantify if it is non-strict ascii - (res, next) = _doMatchBuiltin(builtin!, false) + (matched, next) = _doMatchBuiltin(builtin!, false) case .any: - res = currentPosition != input.endIndex && !input[currentPosition].isNewline - next = res ? input.index(after: currentPosition) : nil + matched = currentPosition != input.endIndex && !input[currentPosition].isNewline + next = matched ? input.index(after: currentPosition) : nil } - guard res else { break } // goto exit - + guard matched else { break } // goto exit currentPosition = next! 
trips += 1 - - // > min-trips control block - if trips < payload.minTrips { continue } // goto loop - - // > exit-policy control block - if extraTrips == 0 { break } // goto exit - extraTrips = extraTrips.map({$0 - 1}) - if payload.quantKind == .eager { - savePoint.additionalPositions.append(currentPosition) - } } // > exit diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a436a7d6d..377d5a7be 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1953,17 +1953,4 @@ extension RegexTests { expectCompletion(regex: #"(a{,4})*"#, in: "aa") expectCompletion(regex: #"((|)+)*"#, in: "aa") } - - func testFastQuant() throws { - firstMatchTest( - #"a{,2}x"#, input: "123aaaxyz", match: "aax") -// firstMatchTest( -// #"a{1,2}"#, input: "123aaaxyz", match: "aa") -// var r = try! Regex("a|b+|c*", as: Substring.self) -// let matches = "aaabbbccc".matches(of: r) -// print(matches.map(\.output)) -// r._setCompilerOptionsForTesting(.disableOptimizations) -// let unoptMatches = "aaabbbccc".matches(of: r) -// print(unoptMatches.map(\.output)) - } } From 42e5a5845a03160cd234c3dcf8b330dd288b8f81 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 17:13:07 -0700 Subject: [PATCH 18/35] Use range based save points --- Sources/_StringProcessing/ByteCodeGen.swift | 7 +++- .../Engine/Backtracking.swift | 35 +++++++++++++---- .../Engine/InstPayload.swift | 35 +++++++---------- .../_StringProcessing/Engine/Processor.swift | 39 ++++++++++--------- .../_StringProcessing/Engine/Tracing.swift | 4 +- .../_CharacterClassModel.swift | 7 ++++ 6 files changed, 75 insertions(+), 52 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 243117072..52f5cfb14 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -697,7 +697,10 @@ fileprivate extension Compiler.ByteCodeGen { 
builder.buildQuantifyAny(kind, minTrips, extraTrips) case .unconverted(let astAtom): if let builtin = astAtom.ast.characterClass?.builtinCC { - assert(!builtin.isStrict(options: options), "Entered emitFastQuant with an invalid case: Strict builtin character class") + assert(!builtin.isStrict(options: options), + "Entered emitFastQuant with an invalid case: Strict builtin character class") + assert(builtin.consumesSingleGrapheme, + "Entered emitFastQuant with an invalid case: Builtin class that does not consume a single grapheme") builder.buildQuantify(builtin: builtin, kind, minTrips, extraTrips) } else { fatalError("Entered emitFastQuant with an invalid case: Not a builtin character class") @@ -882,7 +885,7 @@ extension DSLTree.Node { return !opts.dotMatchesNewline case .unconverted(let astAtom): // Only quantify non-strict built in character classes - if let builtin = astAtom.ast.characterClass?.builtinCC { + if let builtin = astAtom.ast.characterClass?.builtinCC, builtin.consumesSingleGrapheme { return !builtin.isStrict(options: opts) } else { return false diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index a178a82eb..fbde29917 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -13,8 +13,9 @@ extension Processor { struct SavePoint { var pc: InstructionAddress var pos: Position? - // Quantifiers may store many positions to restore to - var additionalPositions: [Position] + // Quantifiers may store a range of positions to restore to + var rangeStart: Position? + var rangeEnd: Position? // The end of the call stack, so we can slice it off // when failing inside a call. 
// @@ -40,13 +41,20 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) { - assert(additionalPositions.isEmpty) + assert(rangeIsEmpty) return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } - var isEmpty: Bool { additionalPositions.isEmpty } + var rangeIsEmpty: Bool { rangeEnd == nil } - mutating func removeLast() -> ( + mutating func updateRange(newEnd: Input.Index) { + if rangeStart == nil { + rangeStart = newEnd + } + rangeEnd = newEnd + } + + mutating func removeLast(_ input: Input) -> ( pc: InstructionAddress, pos: Position?, stackEnd: CallStackAddress, @@ -54,7 +62,16 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) { - (pc, additionalPositions.removeLast(), stackEnd, captureEnds, intRegisters, posRegisters) + assert(!rangeIsEmpty) + let pos = rangeEnd! + if pos == rangeStart { + // The range is now empty + rangeStart = nil + rangeEnd = nil + } else { + rangeEnd = input.index(before: pos) + } + return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } @@ -65,7 +82,8 @@ extension Processor { SavePoint( pc: pc, pos: addressOnly ? 
nil : currentPosition, - additionalPositions: [], + rangeStart: nil, + rangeEnd: nil, stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, @@ -77,7 +95,8 @@ extension Processor { SavePoint( pc: controller.pc + 1, pos: nil, - additionalPositions: [], + rangeStart: nil, + rangeEnd: nil, stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c81fca76d..922aa0250 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -369,23 +369,23 @@ struct QuantifyPayload: RawRepresentable { enum PayloadType: UInt64 { case bitset = 0 - case asciiChar - case any - case builtin + case asciiChar = 1 + case any = 2 + case builtin = 4 } // Future work: optimize this layout -> payload type should be a fast switch // The top 8 bits are reserved for the opcode so we have 56 bits to work with - // b55-b54 - Payload type (one of 4 types) - // b53-b37 - minTrips (16 bit int) - // b37-b20 - extraTrips (16 bit value, one bit for nil) - // b20-b18 - Quantification type (one of three types) - // b18-b16 - Unused - // b16-b0 - Payload value (depends on payload type) - static let quantKindShift: UInt64 = 18 - static let extraTripsShift: UInt64 = 20 - static let minTripsShift: UInt64 = 37 - static let typeShift: UInt64 = 54 + // b55 - Unused + // b54-b51 - Payload type (one of 4 types) + // b51-b35 - minTrips (16 bit int) + // b35-b18 - extraTrips (16 bit value, one bit for nil) + // b18-b16 - Quantification type (one of three types) + // b16-b0 - Payload value (depends on payload type) + static let quantKindShift: UInt64 = 16 + static let extraTripsShift: UInt64 = 18 + static let minTripsShift: UInt64 = 35 + static let typeShift: UInt64 = 51 static func packInfoValues( _ kind: AST.Quantification.Kind, @@ -452,14 +452,7 @@ struct 
QuantifyPayload: RawRepresentable { } var type: PayloadType { - switch (self.rawValue >> QuantifyPayload.typeShift) & 3 { - case 0: return .bitset - case 1: return .asciiChar - case 2: return .any - case 3: return .builtin - default: - fatalError("Unreachable") - } + PayloadType(rawValue: (self.rawValue >> QuantifyPayload.typeShift) & 7)! } var quantKind: AST.Quantification.Kind { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 887c05698..1dcd1c825 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -166,6 +166,11 @@ extension Processor { assert(currentPosition != end) input.formIndex(after: &currentPosition) } + + mutating func _uncheckedForcedConsumeOneScalar() { + assert(currentPosition != end) + input.unicodeScalars.formIndex(after: &currentPosition) + } // Advance in our input // @@ -244,29 +249,26 @@ extension Processor { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> (Bool, Input.Index?) { - // print("doing match scalar \(s)") + func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Bool { if s == loadScalar(), let idx = input.unicodeScalars.index( currentPosition, offsetBy: 1, limitedBy: end), (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - // print("matched") - return (true, idx) + return true } else { - // print("did not match") - return (false, nil) + return false } } mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - let (matched, next) = _doMatchScalar(s, boundaryCheck) + let matched = _doMatchScalar(s, boundaryCheck) guard matched else { signalFailure() return false } - currentPosition = next!
+ _uncheckedForcedConsumeOneScalar() return true } @@ -362,7 +364,7 @@ extension Processor { if extraTrips == 0 { break } // goto exit extraTrips = extraTrips.map({$0 - 1}) if payload.quantKind == .eager { - savePoint.additionalPositions.append(currentPosition) + savePoint.updateRange(newEnd: currentPosition) } } @@ -375,17 +377,16 @@ extension Processor { switch payload.type { case .bitset: matched = _doMatchBitset(bitset!) - next = matched ? input.index(after: currentPosition) : nil case .asciiChar: - (matched, next) = _doMatchScalar(scalar!, true) + matched = _doMatchScalar(scalar!, true) case .builtin: - // We only emit .quantify if it is non-strict ascii - (matched, next) = _doMatchBuiltin(builtin!, false) + // We only emit .quantify if it is non-strict ascii and if it consumes a + // single character + (matched, _) = _doMatchBuiltin(builtin!, false) case .any: matched = currentPosition != input.endIndex && !input[currentPosition].isNewline - next = matched ? input.index(after: currentPosition) : nil } - + next = matched ? input.index(after: currentPosition) : nil guard matched else { break } // goto exit currentPosition = next! 
trips += 1 @@ -397,7 +398,7 @@ extension Processor { return false } - if payload.quantKind == .eager && !savePoint.isEmpty { + if payload.quantKind == .eager && !savePoint.rangeIsEmpty { savePoints.append(savePoint) } return true @@ -416,9 +417,9 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) - if !savePoint.isEmpty { - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoint.removeLast() - if !savePoint.isEmpty { + if !savePoint.rangeIsEmpty { + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoint.removeLast(input) + if !savePoint.rangeIsEmpty { savePoints.append(savePoint) } } else { diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index a829e9d27..cbb065fc1 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -59,10 +59,10 @@ extension Processor.SavePoint { if let p = self.pos { posStr = "\(input.distance(from: input.startIndex, to: p))" } else { - if additionalPositions.isEmpty { + if rangeIsEmpty { posStr = "" } else { - posStr = "\(additionalPositions.map { p in input.distance(from: input.startIndex, to: p) })" + posStr = "\(rangeStart!...rangeEnd!)" } } return """ diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index d91dd4f65..ddc9dddee 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -589,6 +589,13 @@ extension BuiltinCC { default: return false } } + + var consumesSingleGrapheme: Bool { + switch self { + case .anyScalar: return false + default: return true + } + } } extension _CharacterClassModel { From b2dedacba4a2443a40b461ce2b2c7c42c5468fed Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Wed, 13 Jul 2022 17:52:25 -0700 Subject: [PATCH 19/35] Undo the change where I made it recompute index after for some reason --- 
.../_StringProcessing/Engine/Processor.swift | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 1dcd1c825..06fd73a83 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -166,11 +166,6 @@ extension Processor { assert(currentPosition != end) input.formIndex(after: &currentPosition) } - - mutating func _uncheckedForcedConsumeOneScalar() { - assert(currentPosition != end) - input.unicodeScalars.formIndex(after: &currentPosition) - } // Advance in our input // @@ -249,26 +244,26 @@ extension Processor { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Bool { + func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> (Bool, Input.Index?) { if s == loadScalar(), let idx = input.unicodeScalars.index( currentPosition, offsetBy: 1, limitedBy: end), (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return true + return (true, idx) } else { - return false + return (false, nil) } } mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - let matched = _doMatchScalar(s, boundaryCheck) + let (matched, next) = _doMatchScalar(s, boundaryCheck) guard matched else { signalFailure() return false } - _uncheckedForcedConsumeOneScalar() + currentPosition = next! return true } @@ -377,16 +372,17 @@ extension Processor { switch payload.type { case .bitset: matched = _doMatchBitset(bitset!) + next = matched ?
input.index(after: currentPosition) : nil case .asciiChar: - matched = _doMatchScalar(scalar!, true) + (matched, next) = _doMatchScalar(scalar!, true) case .builtin: // We only emit .quantify if it is non-strict ascii and if it consumes a // single character - (matched, _) = _doMatchBuiltin(builtin!, false) + (matched, next) = _doMatchBuiltin(builtin!, false) case .any: matched = currentPosition != input.endIndex && !input[currentPosition].isNewline + next = matched ? input.index(after: currentPosition) : nil } - next = matched ? input.index(after: currentPosition) : nil guard matched else { break } // goto exit currentPosition = next! trips += 1 From 7b4eaff2c629bcab6ae53a6f7fd4e047e594fa42 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Thu, 14 Jul 2022 12:47:09 -0700 Subject: [PATCH 20/35] More cleanup --- Sources/_StringProcessing/ByteCodeGen.swift | 37 +++++---- Sources/_StringProcessing/Compiler.swift | 4 +- .../Engine/InstPayload.swift | 17 ++-- .../_StringProcessing/Engine/MEBuiltins.swift | 14 ++-- .../_StringProcessing/Engine/Processor.swift | 79 +++++++------------ Tests/RegexTests/CompileTests.swift | 2 +- Tests/RegexTests/MatchTests.swift | 27 +++++++ 7 files changed, 93 insertions(+), 87 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 52f5cfb14..6c3824f21 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -454,27 +454,20 @@ fileprivate extension Compiler.ByteCodeGen { let minTrips = low assert((extraTrips ?? 1) >= 0) - // We want to specialize quantification on certain inner nodes - // Those nodes are: + // We want to specialize common quantification cases + // Allowed nodes are: // - .char // - .customCharacterClass // - built in character classes // - .any - // and only in grapheme semantic mode (fixme: for sure?) 
+ // We do this by wrapping a single instruction in a .quantify instruction - - // Lily note: I dont think we can support reluctant quant with this implementation - // style, or at least it wouldn't be any more efficient than the - // existing way we emit reluctant quantifiers - - // The main issue runQuantify solves is the fact that greedy quantifiers - // will loop through processor inefficiently and generate a ton of save points - let x = 65536 // lily todo: fix this once i determine the bit layout - if optimizationsEnabled && child.shouldDoFastQuant(options) && - minTrips < x && - extraTrips ?? 0 < x && - options.matchLevel == .graphemeCluster && - updatedKind != .reluctant { + if optimizationsEnabled + && child.shouldDoFastQuant(options) + && minTrips <= QuantifyPayload.maxStorableTrips + && extraTrips ?? 0 <= QuantifyPayload.maxStorableTrips + && options.matchLevel == .graphemeCluster + && updatedKind != .reluctant { emitFastQuant(child, updatedKind, minTrips, extraTrips) return } @@ -874,6 +867,11 @@ extension DSLTree.Node { switch self { case .customCharacterClass(let ccc): // Only quantify ascii only character classes + + // Future work: Should we allow ConsumeFunctions into .quantify? 
+ // this would open up non-ascii custom character classes as well as the + // possibility of wrapping weirder cases into consume functions + // (non-ascii characters for example) return ccc.asAsciiBitset(opts) != nil case .atom(let atom): switch atom { @@ -885,7 +883,8 @@ extension DSLTree.Node { return !opts.dotMatchesNewline case .unconverted(let astAtom): // Only quantify non-strict built in character classes - if let builtin = astAtom.ast.characterClass?.builtinCC, builtin.consumesSingleGrapheme { + if let builtin = astAtom.ast.characterClass?.builtinCC, + builtin.consumesSingleGrapheme { return !builtin.isStrict(options: opts) } else { return false @@ -902,6 +901,10 @@ extension DSLTree.Node { default: return false } + case .orderedChoice: + // Future work: Could we support ordered choice by compacting our payload + // representation and supporting an alternation of up to N supported nodes? + return false default: return false } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 5b4ad2b28..530126a32 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -38,9 +38,7 @@ class Compiler { compileOptions: compileOptions, captureList: tree.captureList) - let p = try codegen.emitRoot(tree.root) - // print(p) - return p + return try codegen.emitRoot(tree.root) } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 922aa0250..8a6c9d554 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -376,16 +376,17 @@ struct QuantifyPayload: RawRepresentable { // Future work: optimize this layout -> payload type should be a fast switch // The top 8 bits are reserved for the opcode so we have 56 bits to work with - // b55 - Unused - // b54-b51 - Payload type (one of 4 types) - // b51-b35 - minTrips (16 bit int) - // b35-b18 - extraTrips (16 
bit value, one bit for nil) + // b55-b38 - Unused + // b38-b35 - Payload type (one of 4 types, stored on 3 bits) + // b35-b27 - minTrips (8 bit int) + // b27-b18 - extraTrips (8 bit value, one bit for nil) // b18-b16 - Quantification type (one of three types) // b16-b0 - Payload value (depends on payload type) static let quantKindShift: UInt64 = 16 static let extraTripsShift: UInt64 = 18 - static let minTripsShift: UInt64 = 35 - static let typeShift: UInt64 = 51 + static let minTripsShift: UInt64 = 27 + static let typeShift: UInt64 = 35 + static let maxStorableTrips: UInt64 = (1 << 8) - 1 static func packInfoValues( _ kind: AST.Quantification.Kind, @@ -466,11 +467,11 @@ struct QuantifyPayload: RawRepresentable { } var minTrips: UInt64 { - (self.rawValue >> QuantifyPayload.minTripsShift) & 0xFF_FF + (self.rawValue >> QuantifyPayload.minTripsShift) & 0xFF } var extraTrips: UInt64? { - let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & 0x1FF_FF + let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & 0x1FF if val == 1 { return nil } else { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index d677784b2..516d46eed 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -4,9 +4,9 @@ extension Processor { mutating func _doMatchBuiltin( _ cc: BuiltinCC, _ isStrictAscii: Bool - ) -> (Bool, Input.Index?) { + ) -> Input.Index? { guard let c = load() else { - return (false, nil) + return nil } var matched: Bool @@ -31,21 +31,19 @@ extension Processor { case .word: matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) } - return (matched, next) + return matched ? next : nil } mutating func matchBuiltin( _ cc: BuiltinCC, _ isStrictAscii: Bool ) -> Bool { - let (matched, next) = _doMatchBuiltin(cc, isStrictAscii) - if matched { - currentPosition = next! 
- return true - } else { + guard let next = _doMatchBuiltin(cc, isStrictAscii) else { signalFailure() return false } + currentPosition = next + return true } mutating func matchBuiltinScalar( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 06fd73a83..4d2e78e5f 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -244,26 +244,25 @@ extension Processor { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> (Bool, Input.Index?) { + func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? { if s == loadScalar(), let idx = input.unicodeScalars.index( currentPosition, offsetBy: 1, limitedBy: end), (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return (true, idx) + return idx } else { - return (false, nil) + return nil } } mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - let (matched, next) = _doMatchScalar(s, boundaryCheck) - guard matched else { + guard let next = _doMatchScalar(s, boundaryCheck) else { signalFailure() return false } - currentPosition = next! + currentPosition = next return true } @@ -286,11 +285,11 @@ extension Processor { return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Bool { + func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? 
{ if let cur = load(), bitset.matches(char: cur) { - return true + return input.index(after: currentPosition) } else { - return false + return nil } } @@ -300,11 +299,11 @@ extension Processor { mutating func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { - guard _doMatchBitset(bitset) else { + guard let next = _doMatchBitset(bitset) else { signalFailure() return false } - _uncheckedForcedConsumeOne() + currentPosition = next return true } @@ -326,69 +325,49 @@ extension Processor { var trips = 0 var extraTrips = payload.extraTrips var savePoint = startQuantifierSavePoint() - - // Initialize values - // lily note: I hope swift/llvm is smart enough to recognize the code paths - // and elide the unwrapping checks in the hot loop, but I'm not sure - let bitset: DSLTree.CustomCharacterClass.AsciiBitset? - switch payload.type { - case .bitset: - bitset = registers[payload.bitset] - default: - bitset = nil - } - let scalar: UnicodeScalar? - switch payload.type { - case .asciiChar: - scalar = UnicodeScalar.init(_value: UInt32(payload.asciiChar)) - default: scalar = nil - } - let builtin: BuiltinCC? - switch payload.type { - case .builtin: - builtin = payload.builtin - default: - builtin = nil - } while true { if trips >= payload.minTrips { // exit policy // fixme: is there a way to optimize the next two lines out if we know // extraTrips is nil? - if extraTrips == 0 { break } // goto exit + if extraTrips == 0 { break } extraTrips = extraTrips.map({$0 - 1}) if payload.quantKind == .eager { savePoint.updateRange(newEnd: currentPosition) } } - // fixme: maybe the _do methods should always return the next index, lets - // us remove the matched variable entirely. 
- // dunno how thatll affect the normal matching instructions tho, I wanted - // to leave the normal matching as untouched as possible - let matched: Bool + // Future work: Do we want to rework our Processor.Cycle() switch loop + // to do something like this for all of the matching instructions? + // ie: A bunch of _doMatchThing instructions that return Input.Index? + // which we then signalFailure if nil or currentPosition = next otherwise + // This would have the benefit of potentially allowing us to not duplicate + // code between the normal matching instructions and this loop here var next: Input.Index? switch payload.type { case .bitset: - matched = _doMatchBitset(bitset!) - next = matched ? input.index(after: currentPosition) : nil + next = _doMatchBitset(registers[payload.bitset]) case .asciiChar: - (matched, next) = _doMatchScalar(scalar!, true) + next = _doMatchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: // We only emit .quantify if it is non-strict ascii and if it consumes a // single character - (matched, next) = _doMatchBuiltin(builtin!, false) + next = _doMatchBuiltin(payload.builtin, false) case .any: - matched = currentPosition != input.endIndex && !input[currentPosition].isNewline + // We only emit if any does not match newline + // Fixme: we could emit if it matches newline by just having a bit in + // the payload, the any payload is empty anyway + let matched = currentPosition != input.endIndex && + !input[currentPosition].isNewline next = matched ? input.index(after: currentPosition) : nil } - guard matched else { break } // goto exit - currentPosition = next! 
+ guard let idx = next else { break } + currentPosition = idx trips += 1 } - - // > exit + if trips < payload.minTrips { signalFailure() return false diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index a14b87536..279e72b0b 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -267,7 +267,7 @@ extension RegexTests { matchingOptions(adding: [.caseInsensitive])) } - private func expectProgram( + func expectProgram( for regex: String, syntax: SyntaxOptions = .traditional, semanticLevel: RegexSemanticLevel? = nil, diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 377d5a7be..162ad8a2d 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1953,4 +1953,31 @@ extension RegexTests { expectCompletion(regex: #"(a{,4})*"#, in: "aa") expectCompletion(regex: #"((|)+)*"#, in: "aa") } + + func testQuantifyOptimization() throws { + // test that the maximum values for minTrips and extraTrips are handled correctly + let maxStorable = Int(QuantifyPayload.maxStorableTrips) + let maxExtraTrips = "a{,\(maxStorable)}" + expectProgram(for: maxExtraTrips, contains: [.quantify]) + firstMatchTest(maxExtraTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable)) + firstMatchTest(maxExtraTrips, input: String(repeating: "a", count: maxStorable + 1), match: String(repeating: "a", count: maxStorable)) + XCTAssertNil(try Regex(maxExtraTrips).wholeMatch(in: String(repeating: "a", count: maxStorable + 1))) + + let maxMinTrips = "a{\(maxStorable),}" + expectProgram(for: maxMinTrips, contains: [.quantify]) + firstMatchTest(maxMinTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable)) + firstMatchTest(maxMinTrips, input: String(repeating: "a", count: maxStorable - 1), match: nil) + + let maxBothTrips = "a{\(maxStorable),\(maxStorable*2)}" + 
expectProgram(for: maxBothTrips, contains: [.quantify]) + XCTAssertNil(try Regex(maxBothTrips).wholeMatch(in: String(repeating: "a", count: maxStorable*2 + 1))) + firstMatchTest(maxBothTrips, input: String(repeating: "a", count: maxStorable*2), match: String(repeating: "a", count: maxStorable*2)) + firstMatchTest(maxBothTrips, input: String(repeating: "a", count: maxStorable), match: String(repeating: "a", count: maxStorable)) + firstMatchTest(maxBothTrips, input: String(repeating: "a", count: maxStorable - 1), match: nil) + + expectProgram(for: "a{,\(maxStorable+1)}", doesNotContain: [.quantify]) + expectProgram(for: "a{\(maxStorable+1),}", doesNotContain: [.quantify]) + expectProgram(for: "a{\(maxStorable-1),\(maxStorable*2)}", doesNotContain: [.quantify]) + expectProgram(for: "a{\(maxStorable),\(maxStorable*2+1)}", doesNotContain: [.quantify]) + } } From fb1576a7b99fd981d2af4d99c0362a7c94d30761 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Fri, 15 Jul 2022 14:18:14 -0700 Subject: [PATCH 21/35] Update branch to match main --- Sources/_StringProcessing/ByteCodeGen.swift | 26 ---- .../_StringProcessing/Engine/MEBuilder.swift | 2 +- .../_StringProcessing/Engine/MEBuiltins.swift | 129 +++++++----------- Sources/_StringProcessing/Regex/DSLTree.swift | 4 +- .../_CharacterClassModel.swift | 1 + 5 files changed, 57 insertions(+), 105 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5ef71824a..276c80fe2 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -156,32 +156,6 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitStartOfLine() { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return 
input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } - } - - mutating func emitEndOfLine() { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } - } - mutating func emitAssertion( _ kind: DSLTree.Atom.Assertion ) throws { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index b0735c160..44015e87e 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -188,7 +188,7 @@ extension MEProgram.Builder { } mutating func buildAssert( - by kind: AST.Atom.AssertionKind, + by kind: DSLTree.Atom.Assertion, _ anchorsMatchNewlines: Bool, _ usesSimpleUnicodeBoundaries: Bool, _ usesASCIIWord: Bool, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index f79e8f463..af42fe9de 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -86,6 +86,26 @@ extension Processor { return false } } + + func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + if currentPosition == subjectBounds.lowerBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } + } + + func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return 
input.unicodeScalars[currentPosition].isNewline + } + } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { // Future work: Optimize layout and dispatch @@ -116,54 +136,39 @@ extension Processor { case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. + return isAtStartOfLine(payload) + case .endOfLine: + return isAtEndOfLine(payload) + + case .caretAnchor: if payload.anchorsMatchNewlines { - if currentPosition == subjectBounds.lowerBound { return true } - switch payload.semanticLevel { - case .graphemeCluster: - return input[input.index(before: currentPosition)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline - } + return isAtStartOfLine(payload) } else { return currentPosition == subjectBounds.lowerBound } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. - if payload.anchorsMatchNewlines { - if currentPosition == subjectBounds.upperBound { return true } - switch payload.semanticLevel { - case .graphemeCluster: - return input[currentPosition].isNewline - case .unicodeScalar: - return input.unicodeScalars[currentPosition].isNewline - } - } else { - return currentPosition == subjectBounds.upperBound - } - - case .wordBoundary: - if payload.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? 
- return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) - } else { - return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } - - case .notWordBoundary: - if payload.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) - } else { - return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } + + case .dollarAnchor: + if payload.anchorsMatchNewlines { + return isAtEndOfLine(payload) + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + + case .notWordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? 
+ return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } } } } @@ -179,7 +184,7 @@ struct AssertionPayload: RawRepresentable { assert(rawValue & _opcodeMask == 0) } - init(_ assertion: AST.Atom.AssertionKind, + init(_ assertion: DSLTree.Atom.Assertion, _ anchorsMatchNewlines: Bool, _ usesSimpleUnicodeBoundaries: Bool, _ usesASCIIWord: Bool, @@ -194,40 +199,12 @@ struct AssertionPayload: RawRepresentable { // 4 bits for the assertion kind // Future work: Optimize this layout - let kind: UInt64 - switch assertion { - case .endOfLine: kind = 0 - case .endOfSubject: kind = 1 - case .endOfSubjectBeforeNewline: kind = 2 - case .firstMatchingPositionInSubject: kind = 3 - case .notTextSegment: kind = 4 - case .notWordBoundary: kind = 5 - case .resetStartOfMatch: kind = 6 - case .startOfLine: kind = 7 - case .startOfSubject: kind = 8 - case .textSegment: kind = 9 - case .wordBoundary: kind = 10 - } + let kind = assertion.rawValue self.init(rawValue: kind + optionsBits) } - var kind: AST.Atom.AssertionKind { - let kind: AST.Atom.AssertionKind - switch self.rawValue & _assertionKindMask { - case 0: kind = .endOfLine - case 1: kind = .endOfSubject - case 2: kind = .endOfSubjectBeforeNewline - case 3: kind = .firstMatchingPositionInSubject - case 4: kind = .notTextSegment - case 5: kind = .notWordBoundary - case 6: kind = .resetStartOfMatch - case 7: kind = .startOfLine - case 8: kind = .startOfSubject - case 9: kind = .textSegment - case 10: kind = .wordBoundary - default: fatalError("Unreachable") - } - return kind + var kind: DSLTree.Atom.Assertion { + return .init(rawValue: self.rawValue & _assertionKindMask)! 
} var anchorsMatchNewlines: Bool { (self.rawValue >> 55) & 1 == 1 } var usesSimpleUnicodeBoundaries: Bool { (self.rawValue >> 54) & 1 == 1 } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 4ea905fd5..a98bd8441 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -189,9 +189,9 @@ extension DSLTree { extension DSLTree.Atom { @_spi(RegexBuilder) - public enum Assertion: Hashable { + public enum Assertion: UInt64, Hashable { /// \A - case startOfSubject + case startOfSubject = 0 /// \Z case endOfSubjectBeforeNewline diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 003d5037a..f32e74693 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -316,6 +316,7 @@ extension BuiltinCC { case .hexDigit: return options.usesASCIIDigits case .horizontalWhitespace: return options.usesASCIISpaces case .newlineSequence: return options.usesASCIISpaces + case .verticalWhitespace: return options.usesASCIISpaces case .whitespace: return options.usesASCIISpaces case .word: return options.usesASCIIWord default: return false From 3b9485efd270038a4aee5d61cc8a2a851390b213 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Fri, 15 Jul 2022 17:35:34 -0700 Subject: [PATCH 22/35] Use the newly cleaned up _CharacterClassModel --- Sources/RegexBuilder/CharacterClass.swift | 25 +++--- Sources/_StringProcessing/ByteCodeGen.swift | 11 ++- .../_StringProcessing/ConsumerInterface.swift | 1 - .../Engine/InstPayload.swift | 53 +++++++++---- .../_StringProcessing/Engine/MEBuilder.swift | 4 +- .../_StringProcessing/Engine/MEBuiltins.swift | 18 +++-- .../_StringProcessing/Engine/Processor.swift | 8 +- .../Utility/RegexFactory.swift | 8 ++ .../_CharacterClassModel.swift | 79 +++++-------------- 9 files changed, 105 insertions(+), 102 
deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index ea52c28f3..289a8c66b 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -15,20 +15,27 @@ @available(SwiftStdlib 5.7, *) public struct CharacterClass { internal var ccc: DSLTree.CustomCharacterClass + internal var builtin: DSLTree._AST.Atom? // lily note: This seems illegal init(_ ccc: DSLTree.CustomCharacterClass) { self.ccc = ccc + self.builtin = nil } - init(unconverted atom: DSLTree._AST.Atom) { + init(builtin atom: DSLTree._AST.Atom) { self.ccc = .init(members: [.atom(.unconverted(atom))]) + self.builtin = atom } } @available(SwiftStdlib 5.7, *) extension CharacterClass: RegexComponent { public var regex: Regex { - _RegexFactory().customCharacterClass(ccc) + if let unconverted = builtin { + return _RegexFactory().unconverted(unconverted) + } else { + return _RegexFactory().customCharacterClass(ccc) + } } } @@ -50,15 +57,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: ._anyGrapheme) + .init(builtin: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: ._whitespace) + .init(builtin: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: ._digit) + .init(builtin: ._digit) } public static var hexDigit: CharacterClass { @@ -70,19 +77,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: ._horizontalWhitespace) + .init(builtin: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: ._newlineSequence) + .init(builtin: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: ._verticalWhitespace) + .init(builtin: ._verticalWhitespace) } public static var word: CharacterClass { - 
.init(unconverted: ._word) + .init(builtin: ._word) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 276c80fe2..88ec2de0a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -90,11 +90,10 @@ fileprivate extension Compiler.ByteCodeGen { options.apply(optionSequence.ast) case let .unconverted(astAtom): - if optimizationsEnabled, - let cc = astAtom.ast.characterClass?.builtinCC { + if let cc = astAtom.ast.characterClass { builder.buildMatchBuiltin( cc, - cc.isStrict(options: options), + cc.isStrictAscii(options: options), isScalar: options.semanticLevel == .unicodeScalar) return } @@ -666,10 +665,10 @@ fileprivate extension Compiler.ByteCodeGen { } else { builder.buildMatchAsciiBitset(asciiBitset) } - } else { - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + return } + let consumer = try ccc.generateConsumer(options) + builder.buildConsume(by: consumer) } @discardableResult diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 668d16eb6..dbbaf314b 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -269,7 +269,6 @@ extension AST.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - // TODO: Wean ourselves off of this type... 
if let cc = self.characterClass?.withMatchLevel( opts.matchLevel ) { diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 96c307c75..89b0d410b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -224,20 +224,11 @@ extension Instruction.Payload { return (isScalar: pair.0 == 1, pair.1) } - init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool) { - let strictBit = isStrict ? 1 << 15 : 0 - let scalarBit = isScalar ? 1 << 14 : 0 - // val must be 16 bits, reserve the top 2 bits for if it is strict ascii or scalar - assert(cc.rawValue <= 0x3F_FF) - let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit) - self.init(val) - } - var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool) { - let val = self.rawValue - let cc = BuiltinCC(rawValue: val & 0x3F_FF)! - let isStrict = (val >> 15) & 1 == 1 - let isScalar = (val >> 14) & 1 == 1 - return (cc, isStrict, isScalar) + init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) { + self.init(CharacterClassPayload(cc, isInverted, isStrict, isScalar).rawValue) + } + var characterClassPayload: CharacterClassPayload{ + return CharacterClassPayload(rawValue: rawValue & _payloadMask) } init(consumer: ConsumeFunctionRegister) { @@ -355,3 +346,37 @@ extension Instruction.Payload { } } +struct CharacterClassPayload: RawRepresentable { + let rawValue: UInt64 + // Layout: + // Top three bits are isInverted, isStrict, isScalar + // Lower 16 bits are _CCM.Representation + static let invertedShift: UInt64 = 55 + static let strictShift: UInt64 = 54 + static let scalarShift: UInt64 = 53 + static let ccMask: UInt64 = 0xFF + init(rawValue: UInt64) { + assert(rawValue & _opcodeMask == 0) + self.rawValue = rawValue + } + init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) { + let 
invertedBit = isInverted ? 1 << CharacterClassPayload.invertedShift : 0 + let strictBit = isStrict ? 1 << CharacterClassPayload.strictShift : 0 + let scalarBit = isScalar ? 1 << CharacterClassPayload.scalarShift : 0 + assert(cc.rawValue <= CharacterClassPayload.ccMask) // + self.init(rawValue: cc.rawValue + UInt64(invertedBit) + UInt64(strictBit) + UInt64(scalarBit)) + } + + var isInverted: Bool { + (self.rawValue >> CharacterClassPayload.invertedShift) & 1 == 1 + } + var isStrict: Bool { + (self.rawValue >> CharacterClassPayload.strictShift) & 1 == 1 + } + var isScalar: Bool { + (self.rawValue >> CharacterClassPayload.scalarShift) & 1 == 1 + } + var cc: _CharacterClassModel.Representation { + _CharacterClassModel.Representation.init(rawValue: self.rawValue & CharacterClassPayload.ccMask)! + } +} diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 44015e87e..421ed5da3 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -172,12 +172,12 @@ extension MEProgram.Builder { } mutating func buildMatchBuiltin( - _ cc: BuiltinCC, + _ cc: _CharacterClassModel, _ isStrict: Bool, isScalar: Bool ) { instructions.append(.init( - .matchBuiltin, .init(cc, isStrict, isScalar))) + .matchBuiltin, .init(cc.cc, cc.isInverted, isStrict, isScalar))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index af42fe9de..83c2e947c 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -2,12 +2,13 @@ extension Processor { mutating func matchBuiltin( - _ cc: BuiltinCC, + _ cc: _CharacterClassModel.Representation, + _ isInverted: Bool, _ isStrictAscii: Bool ) -> Bool { guard let c = load() else { signalFailure() - return false + return isInverted } var matched: Bool @@ -32,7 +33,9 @@ extension Processor { 
case .word: matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) } - + if isInverted { + matched.toggle() + } if matched { currentPosition = next return true @@ -43,12 +46,13 @@ extension Processor { } mutating func matchBuiltinScalar( - _ cc: BuiltinCC, + _ cc: _CharacterClassModel.Representation, + _ isInverted: Bool, _ isStrictAscii: Bool ) -> Bool { guard let c = loadScalar() else { signalFailure() - return false + return isInverted } var matched: Bool @@ -77,7 +81,9 @@ extension Processor { case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) } - + if isInverted { + matched.toggle() + } if matched { currentPosition = next return true diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 5f56ca881..0907164d8 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -478,13 +478,13 @@ extension Processor { } case .matchBuiltin: - let (cc, isStrict, isScalar) = payload.builtinCCPayload - if isScalar { - if matchBuiltinScalar(cc, isStrict) { + let payload = payload.characterClassPayload + if payload.isScalar { + if matchBuiltinScalar(payload.cc, payload.isInverted, payload.isStrict) { controller.step() } } else { - if matchBuiltin(cc, isStrict) { + if matchBuiltin(payload.cc, payload.isInverted, payload.isStrict) { controller.step() } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 31245c0f7..3c2e13a3e 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -58,6 +58,14 @@ public struct _RegexFactory { ) -> Regex { .init(node: .atom(.scalar(scalar))) } + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func unconverted( + _ atom: DSLTree._AST.Atom + ) -> Regex { + .init(node: .atom(.unconverted(atom))) + } @_spi(RegexBuilder) 
@available(SwiftStdlib 5.7, *) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index f32e74693..282ba1eb2 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -27,9 +27,9 @@ struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? - enum Representation: Hashable { + enum Representation: UInt64, Hashable { /// Any character - case any + case any = 0 /// Any grapheme cluster case anyGrapheme /// Any Unicode scalar @@ -70,6 +70,20 @@ struct _CharacterClassModel: Hashable { return result } + /// Returns true if this CharacterClass should be matched by strict ascii under the given options + func isStrictAscii(options: MatchingOptions) -> Bool { + switch self { + case .digit: return options.usesASCIIDigits + case .hexDigit: return options.usesASCIIDigits + case .horizontalWhitespace: return options.usesASCIISpaces + case .newlineSequence: return options.usesASCIISpaces + case .verticalWhitespace: return options.usesASCIISpaces + case .whitespace: return options.usesASCIISpaces + case .word: return options.usesASCIIWord + default: return false + } + } + /// Conditionally inverts a character class. /// /// - Parameter inversion: Indicates whether to invert the character class. @@ -95,6 +109,9 @@ struct _CharacterClassModel: Hashable { /// - Parameter options: Options for the match operation. /// - Returns: The index of the end of the match, or `nil` if there is no match. func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? { + // FIXME: This is only called in custom character classes that contain builtin + // character classes as members (ie: [a\w] or set operations), is there + // any way to avoid that? Can we remove this somehow? 
switch matchLevel { case .graphemeCluster: let c = str[i] @@ -295,61 +312,3 @@ extension AST.Atom.EscapedBuiltin { } } } - -internal enum BuiltinCC: UInt64 { - case any = 1 - case anyGrapheme - case anyScalar - case digit - case hexDigit - case horizontalWhitespace - case newlineSequence - case verticalWhitespace - case whitespace - case word -} - -extension BuiltinCC { - func isStrict(options: MatchingOptions) -> Bool { - switch self { - case .digit: return options.usesASCIIDigits - case .hexDigit: return options.usesASCIIDigits - case .horizontalWhitespace: return options.usesASCIISpaces - case .newlineSequence: return options.usesASCIISpaces - case .verticalWhitespace: return options.usesASCIISpaces - case .whitespace: return options.usesASCIISpaces - case .word: return options.usesASCIIWord - default: return false - } - } -} - -extension _CharacterClassModel { - internal var builtinCC: BuiltinCC? { - // Future work: Make CCM always either a BuiltinCC or convertable to a - // custom character class - if isInverted { return nil } - switch self.cc { - case .any: - return .any - case .anyGrapheme: - return .anyGrapheme - case .anyScalar: - return .anyScalar - case .digit: - return .digit - case .hexDigit: - return .hexDigit - case .horizontalWhitespace: - return .horizontalWhitespace - case .newlineSequence: - return .newlineSequence - case .verticalWhitespace: - return .verticalWhitespace - case .whitespace: - return .whitespace - case .word: - return .word - } - } -} From 64d1ed9d230975967762952ef9cb9dd636bd25da Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Fri, 15 Jul 2022 18:56:24 -0700 Subject: [PATCH 23/35] Add characterClass DSLTree node --- Sources/RegexBuilder/CharacterClass.swift | 37 +++++---- Sources/_StringProcessing/ByteCodeGen.swift | 19 +++-- .../_StringProcessing/ConsumerInterface.swift | 26 ++++--- .../_StringProcessing/Engine/MEBuiltins.swift | 4 - .../_StringProcessing/PrintAsPattern.swift | 39 ++++++++++ .../Regex/ASTConversion.swift | 37 
++++++++- Sources/_StringProcessing/Regex/DSLTree.swift | 69 +++++++++------- .../Utility/RegexFactory.swift | 6 +- .../_CharacterClassModel.swift | 78 ++++--------------- 9 files changed, 181 insertions(+), 134 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 289a8c66b..4cb0a5e42 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -15,24 +15,25 @@ @available(SwiftStdlib 5.7, *) public struct CharacterClass { internal var ccc: DSLTree.CustomCharacterClass - internal var builtin: DSLTree._AST.Atom? // lily note: This seems illegal + /// The builtin character class, if this CharacterClass is representable by one + internal var builtin: DSLTree.Atom.CharacterClass? init(_ ccc: DSLTree.CustomCharacterClass) { self.ccc = ccc self.builtin = nil } - init(builtin atom: DSLTree._AST.Atom) { - self.ccc = .init(members: [.atom(.unconverted(atom))]) - self.builtin = atom + init(builtin: DSLTree.Atom.CharacterClass) { + self.ccc = .init(members: [.atom(.characterClass(builtin))]) + self.builtin = builtin } } @available(SwiftStdlib 5.7, *) extension CharacterClass: RegexComponent { public var regex: Regex { - if let unconverted = builtin { - return _RegexFactory().unconverted(unconverted) + if let cc = builtin { + return _RegexFactory().characterClass(cc) } else { return _RegexFactory().customCharacterClass(ccc) } @@ -42,7 +43,15 @@ extension CharacterClass: RegexComponent { @available(SwiftStdlib 5.7, *) extension CharacterClass { public var inverted: CharacterClass { - CharacterClass(ccc.inverted) + return CharacterClass(ccc.inverted) + // lily fixme: this causes a precondition to fail in Capture.swift... why? + // why are the inverted builtins causing issues? 
+ // Match tests are all passing +// if let inv = builtin?.inverted { +// return CharacterClass(builtin: inv) +// } else { +// return CharacterClass(ccc.inverted) +// } } } @@ -57,15 +66,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(builtin: ._anyGrapheme) + .init(builtin: .anyGrapheme) } public static var whitespace: CharacterClass { - .init(builtin: ._whitespace) + .init(builtin: .whitespace) } public static var digit: CharacterClass { - .init(builtin: ._digit) + .init(builtin: .digit) } public static var hexDigit: CharacterClass { @@ -77,19 +86,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(builtin: ._horizontalWhitespace) + .init(builtin: .horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(builtin: ._newlineSequence) + .init(builtin: .newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(builtin: ._verticalWhitespace) + .init(builtin: .verticalWhitespace) } public static var word: CharacterClass { - .init(builtin: ._word) + .init(builtin: .word) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 88ec2de0a..d6277d5c9 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen { emitMatchScalar(s) } + case let .characterClass(cc): + emitCharacterClass(cc) + case let .assertion(kind): try emitAssertion(kind) @@ -90,13 +93,6 @@ fileprivate extension Compiler.ByteCodeGen { options.apply(optionSequence.ast) case let .unconverted(astAtom): - if let cc = astAtom.ast.characterClass { - builder.buildMatchBuiltin( - cc, - cc.isStrictAscii(options: options), - isScalar: options.semanticLevel == .unicodeScalar) - return - } if let consumer = try astAtom.ast.generateConsumer(options) { 
builder.buildConsume(by: consumer) } else { @@ -168,7 +164,14 @@ fileprivate extension Compiler.ByteCodeGen { options.usesASCIIWord, options.semanticLevel) } - + + mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { + builder.buildMatchBuiltin( + cc.model, + cc.model.isStrictAscii(options: options), + isScalar: options.semanticLevel == .unicodeScalar) + } + mutating func emitMatchScalar(_ s: UnicodeScalar) { assert(options.semanticLevel == .unicodeScalar) if options.isCaseInsensitive && s.properties.isCased { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbbaf314b..b37b9341a 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -162,6 +162,8 @@ extension DSLTree.Atom { case .assertion: // TODO: We could handle, should this be total? return nil + case .characterClass(let cc): + return cc.generateConsumer(opts) case .backreference: // TODO: Should we handle? @@ -182,6 +184,16 @@ extension DSLTree.Atom { } } +extension DSLTree.Atom.CharacterClass { + func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { + return { input, bounds in + // FIXME: should we worry about out of bounds? + model.withMatchLevel(opts.matchLevel) + .matches(in: input, at: bounds.lowerBound, with: opts) + } + } +} + extension String { /// Compares this string to `other` using the loose matching rule UAX44-LM2, /// which ignores case, whitespace, underscores, and nearly all medial @@ -269,15 +281,6 @@ extension AST.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - if let cc = self.characterClass?.withMatchLevel( - opts.matchLevel - ) { - return { input, bounds in - // FIXME: should we worry about out of bounds? 
- cc.matches(in: input, at: bounds.lowerBound, with: opts) - } - } - switch kind { case let .scalar(s): assertionFailure( @@ -311,8 +314,11 @@ extension AST.Atom { case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil + case .escaped: + // handled in emitAssertion and emitCharacterClass + return nil - case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, + case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 83c2e947c..4f14b0a06 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -20,8 +20,6 @@ extension Processor { next = input.unicodeScalars.index(after: currentPosition) case .digit: matched = c.isNumber && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !isStrictAscii) case .horizontalWhitespace: matched = c.unicodeScalars.first?.isHorizontalWhitespace == true && (c.isASCII || !isStrictAscii) @@ -65,8 +63,6 @@ extension Processor { next = input.index(after: currentPosition) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) case .horizontalWhitespace: matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) case .verticalWhitespace: diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 80f2e7697..321c27747 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -684,6 +684,41 @@ extension DSLTree.Atom.Assertion { } } +extension DSLTree.Atom.CharacterClass { + var _patternBase: String { + switch self { + case 
.anyGrapheme: + return ".anyGraphemeCluster" + case .anyUnicodeScalar: + return ".anyUnicodeScalar" + case .digit: + return ".digit" + case .notDigit: + return ".digit.inverted" + case .word: + return ".word" + case .notWord: + return ".word.inverted" + case .horizontalWhitespace: + return ".horizontalWhitespace" + case .notHorizontalWhitespace: + return ".horizontalWhitespace.inverted" + case .newlineSequence: + return ".newlineSequence" + case .notNewline: + return ".newlineSequence.inverted" + case .verticalWhitespace: + return ".verticalWhitespace" + case .notVerticalWhitespace: + return ".vertialWhitespace.inverted" + case .whitespace: + return ".whitespace" + case .notWhitespace: + return ".whitespace.inverted" + } + } +} + extension AST.Atom.CharacterProperty { var isUnprintableProperty: Bool { switch kind { @@ -1156,6 +1191,8 @@ extension DSLTree.Atom { case .assertion(let a): return (a._patternBase, false) + case .characterClass(let cc): + return (cc._patternBase, true) case .backreference(_): return ("/* TOOD: backreferences */", false) @@ -1200,6 +1237,8 @@ extension DSLTree.Atom { case .assertion: return "/* TODO: assertions */" + case .characterClass: + return "/* TODO: character classes */" case .backreference: return "/* TOOD: backreferences */" case .symbolicReference: diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c4ac8e759..1702b7761 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -223,6 +223,25 @@ extension AST.Atom.EscapedBuiltin { default: return nil } } + var dslCharacterClass: DSLTree.Atom.CharacterClass? 
{ + switch self { + case .decimalDigit: return .digit + case .notDecimalDigit: return .notDigit + case .horizontalWhitespace: return .horizontalWhitespace + case .notHorizontalWhitespace: return .notHorizontalWhitespace + case .newlineSequence: return .newlineSequence + case .notNewline: return .notNewline + case .whitespace: return .whitespace + case .notWhitespace: return .notWhitespace + case .verticalTab: return .verticalWhitespace + case .notVerticalTab: return .notVerticalWhitespace + case .wordCharacter: return .word + case .notWordCharacter: return .notWord + case .graphemeCluster: return .anyGrapheme + case .trueAnychar: return .anyUnicodeScalar + default: return nil + } + } } extension AST.Atom { @@ -234,6 +253,12 @@ extension AST.Atom { default: return nil } } + var dslCharacterClass: DSLTree.Atom.CharacterClass? { + switch kind { + case .escaped(let b): return b.dslCharacterClass + default: return nil + } + } } extension AST.Atom { @@ -241,6 +266,10 @@ extension AST.Atom { if let kind = dslAssertionKind { return .assertion(kind) } + + if let cc = dslCharacterClass { + return .characterClass(cc) + } switch self.kind { case let .char(c): return .char(c) @@ -249,9 +278,11 @@ extension AST.Atom { case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) - case .escaped(let c) where c.scalarValue != nil: - return .scalar(c.scalarValue!) 
- + case .escaped(let c): + guard let val = c.scalarValue else { + fatalError("Got a .escaped that was not an assertion, character class, or scalar value \(self)") + } + return .scalar(val) default: return .unconverted(.init(ast: self)) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index a98bd8441..b073511c1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -177,6 +177,7 @@ extension DSLTree { /// newlines unless single line mode is enabled. case dot + case characterClass(CharacterClass) case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -231,6 +232,46 @@ extension DSLTree.Atom { /// \B case notWordBoundary } + + @_spi(RegexBuilder) + public enum CharacterClass: Hashable { + case digit + case notDigit + case horizontalWhitespace + case notHorizontalWhitespace + case newlineSequence + case notNewline + case whitespace + case notWhitespace + case verticalWhitespace + case notVerticalWhitespace + case word + case notWord + case anyGrapheme + case anyUnicodeScalar + } +} + +extension DSLTree.Atom.CharacterClass { + @_spi(RegexBuilder) + public var inverted: DSLTree.Atom.CharacterClass? 
{ + switch self { + case .anyGrapheme: return nil + case .anyUnicodeScalar: return nil + case .digit: return .notDigit + case .notDigit: return .digit + case .word: return .notWord + case .notWord: return .word + case .horizontalWhitespace: return .notHorizontalWhitespace + case .notHorizontalWhitespace: return .horizontalWhitespace + case .newlineSequence: return .notNewline + case .notNewline: return .newlineSequence + case .verticalWhitespace: return .notVerticalWhitespace + case .notVerticalWhitespace: return .verticalWhitespace + case .whitespace: return .notWhitespace + case .notWhitespace: return .whitespace + } + } } extension Unicode.GeneralCategory { @@ -759,34 +800,8 @@ extension DSLTree { internal var ast: AST.MatchingOptionSequence } - @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom - - // FIXME: The below APIs should be removed once the DSL tree has been - // migrated to use proper DSL atoms for them. - - public static var _anyGrapheme: Self { - .init(ast: .init(.escaped(.graphemeCluster), .fake)) - } - public static var _whitespace: Self { - .init(ast: .init(.escaped(.whitespace), .fake)) - } - public static var _digit: Self { - .init(ast: .init(.escaped(.decimalDigit), .fake)) - } - public static var _horizontalWhitespace: Self { - .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) - } - public static var _newlineSequence: Self { - .init(ast: .init(.escaped(.newlineSequence), .fake)) - } - public static var _verticalWhitespace: Self { - .init(ast: .init(.escaped(.verticalTab), .fake)) - } - public static var _word: Self { - .init(ast: .init(.escaped(.wordCharacter), .fake)) - } } } } @@ -800,7 +815,7 @@ extension DSLTree.Atom { case .changeMatchingOptions, .assertion: return false case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, - .symbolicReference, .unconverted: + .symbolicReference, .unconverted, .characterClass: return true } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift 
b/Sources/_StringProcessing/Utility/RegexFactory.swift index 3c2e13a3e..e0df906fa 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -61,10 +61,10 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) - public func unconverted( - _ atom: DSLTree._AST.Atom + public func characterClass( + _ cc: DSLTree.Atom.CharacterClass ) -> Regex { - .init(node: .atom(.unconverted(atom))) + .init(node: .atom(.characterClass(cc))) } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 282ba1eb2..2431b3f45 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -36,8 +36,6 @@ struct _CharacterClassModel: Hashable { case anyScalar /// Character.isDigit case digit - /// Character.isHexDigit - case hexDigit /// Horizontal whitespace: `[:blank:]`, i.e /// `[\p{gc=Space_Separator}\N{CHARACTER TABULATION}] case horizontalWhitespace @@ -74,7 +72,6 @@ struct _CharacterClassModel: Hashable { func isStrictAscii(options: MatchingOptions) -> Bool { switch self { case .digit: return options.usesASCIIDigits - case .hexDigit: return options.usesASCIIDigits case .horizontalWhitespace: return options.usesASCIISpaces case .newlineSequence: return options.usesASCIISpaces case .verticalWhitespace: return options.usesASCIISpaces @@ -84,22 +81,11 @@ struct _CharacterClassModel: Hashable { } } - /// Conditionally inverts a character class. - /// - /// - Parameter inversion: Indicates whether to invert the character class. - /// - Returns: The inverted character class if `inversion` is `true`; - /// otherwise, the same character class. - func withInversion(_ inversion: Bool) -> Self { - var copy = self - if inversion { - copy.isInverted.toggle() - } - return copy - } - /// Inverts a character class. 
var inverted: Self { - return withInversion(true) + var copy = self + copy.isInverted.toggle() + return copy } /// Returns the end of the match of this character class in the string. @@ -124,8 +110,6 @@ struct _CharacterClassModel: Hashable { next = str.unicodeScalars.index(after: i) case .digit: matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) case .horizontalWhitespace: matched = c.unicodeScalars.first?.isHorizontalWhitespace == true && (c.isASCII || !options.usesASCIISpaces) @@ -153,8 +137,6 @@ struct _CharacterClassModel: Hashable { nextIndex = str.index(after: i) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) case .horizontalWhitespace: matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) case .verticalWhitespace: @@ -197,10 +179,6 @@ extension _CharacterClassModel { static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - - static var hexDigit: _CharacterClassModel { - .init(cc: .hexDigit, matchLevel: .graphemeCluster) - } static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) @@ -226,7 +204,6 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .anyGrapheme: return "" case .anyScalar: return "" case .digit: return "" - case .hexDigit: return "" case .horizontalWhitespace: return "" case .newlineSequence: return "" case .verticalWhitespace: return "vertical whitespace" @@ -252,37 +229,11 @@ extension _CharacterClassModel { } } -extension AST.Atom { - var characterClass: _CharacterClassModel? { - switch kind { - case let .escaped(b): return b.characterClass - - case .property: - // TODO: Would our model type for character classes include - // this? 
Or does grapheme-semantic mode complicate that? - return nil - - case .dot: - // `.dot` is handled in the matching engine by Compiler.emitDot() and in - // the legacy compiler by the `.any` instruction, which can provide lower - // level instructions than the CharacterClass-generated consumer closure - // - // FIXME: We shouldn't be returning `nil` here, but instead fixing the call - // site to check for any before trying to construct a character class. - return nil - - default: return nil - - } - } - -} - -extension AST.Atom.EscapedBuiltin { - var characterClass: _CharacterClassModel? { +extension DSLTree.Atom.CharacterClass { + var model: _CharacterClassModel { switch self { - case .decimalDigit: return .digit - case .notDecimalDigit: return .digit.inverted + case .digit: return .digit + case .notDigit: return .digit.inverted case .horizontalWhitespace: return .horizontalWhitespace case .notHorizontalWhitespace: @@ -298,17 +249,14 @@ extension AST.Atom.EscapedBuiltin { case .whitespace: return .whitespace case .notWhitespace: return .whitespace.inverted - case .verticalTab: return .verticalWhitespace - case .notVerticalTab: return .verticalWhitespace.inverted - - case .wordCharacter: return .word - case .notWordCharacter: return .word.inverted + case .verticalWhitespace: return .verticalWhitespace + case .notVerticalWhitespace: return .verticalWhitespace.inverted - case .graphemeCluster: return .anyGrapheme - case .trueAnychar: return .anyUnicodeScalar + case .word: return .word + case .notWord: return .word.inverted - default: - return nil + case .anyGrapheme: return .anyGrapheme + case .anyUnicodeScalar: return .anyUnicodeScalar } } } From 2a6fe3c8ee084cb1a05a750b627e98e99c8bb299 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 18 Jul 2022 18:00:46 -0700 Subject: [PATCH 24/35] Bugfixes - matchBuiltin always fails if at endIndex - fix switch in isStrictAscii --- Sources/RegexBuilder/CharacterClass.swift | 14 +++++--------- 
Sources/_StringProcessing/Engine/MEBuiltins.swift | 4 ++-- .../_StringProcessing/_CharacterClassModel.swift | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4cb0a5e42..08c7d347e 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -43,15 +43,11 @@ extension CharacterClass: RegexComponent { @available(SwiftStdlib 5.7, *) extension CharacterClass { public var inverted: CharacterClass { - return CharacterClass(ccc.inverted) - // lily fixme: this causes a precondition to fail in Capture.swift... why? - // why are the inverted builtins causing issues? - // Match tests are all passing -// if let inv = builtin?.inverted { -// return CharacterClass(builtin: inv) -// } else { -// return CharacterClass(ccc.inverted) -// } + if let inv = builtin?.inverted { + return CharacterClass(builtin: inv) + } else { + return CharacterClass(ccc.inverted) + } } } diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 4f14b0a06..21edc2ce4 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -8,7 +8,7 @@ extension Processor { ) -> Bool { guard let c = load() else { signalFailure() - return isInverted + return false } var matched: Bool @@ -50,7 +50,7 @@ extension Processor { ) -> Bool { guard let c = loadScalar() else { signalFailure() - return isInverted + return false } var matched: Bool diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 2431b3f45..b3fef17fb 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -70,7 +70,7 @@ struct _CharacterClassModel: Hashable { /// Returns true if this CharacterClass should be matched by strict ascii under the 
given options func isStrictAscii(options: MatchingOptions) -> Bool { - switch self { + switch self.cc { case .digit: return options.usesASCIIDigits case .horizontalWhitespace: return options.usesASCIISpaces case .newlineSequence: return options.usesASCIISpaces From 9ed9f576f79b36a4f4db79af15c7e8ab78ddba3f Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 11:24:48 -0700 Subject: [PATCH 25/35] Allow strict and inverted character classes --- Sources/_StringProcessing/ByteCodeGen.swift | 15 +++++++++------ .../_StringProcessing/Engine/InstPayload.swift | 15 ++++++++++++--- Sources/_StringProcessing/Engine/MEBuilder.swift | 5 ++++- Sources/_StringProcessing/Engine/Processor.swift | 7 +++---- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index dce9ae73c..7b222a0b1 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -710,11 +710,15 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildQuantifyAny(kind, minTrips, extraTrips) case .characterClass(let cc): let model = cc.model - assert(!model.isStrictAscii(options: options), - "Entered emitFastQuant with an invalid case: Strict builtin character class") assert(model.consumesSingleGrapheme, "Entered emitFastQuant with an invalid case: Builtin class that does not consume a single grapheme") - builder.buildQuantify(builtin: model.cc, kind, minTrips, extraTrips) + builder.buildQuantify( + builtin: model.cc, + isStrict: model.isStrictAscii(options: options), + isInverted: model.isInverted, + kind, + minTrips, + extraTrips) default: fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") } @@ -899,9 +903,8 @@ extension DSLTree.Node { // Only quantify if we have a default behavior .any return !opts.dotMatchesNewline case .characterClass(let cc): - // Only quantify if we have a non-strict non-inverted 
character class - // Fixme: we can do both of these - return !cc.model.isInverted && !cc.model.isStrictAscii(options: opts) + // Only quantify if it consumes a single grapheme + return cc.model.consumesSingleGrapheme default: return false } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 1e8fca013..e0fc6dfb1 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -435,12 +435,15 @@ struct QuantifyPayload: RawRepresentable { init( builtin: _CharacterClassModel.Representation, + _ isStrict: Bool, + _ isInverted: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int? ) { - assert(builtin.rawValue < 0xFF_FF) - self.rawValue = builtin.rawValue + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) + assert(builtin.rawValue < 0xFF) + let packedModel = builtin.rawValue + (isInverted ? 1 << 9 : 0) + (isStrict ? 1 << 10 : 0) + self.rawValue = packedModel + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) } var type: PayloadType { @@ -479,7 +482,13 @@ struct QuantifyPayload: RawRepresentable { } var builtin: _CharacterClassModel.Representation { - _CharacterClassModel.Representation(rawValue: self.rawValue & 0xFF_FF)! + _CharacterClassModel.Representation(rawValue: self.rawValue & 0xFF)! 
+ } + var builtinIsInverted: Bool { + (self.rawValue >> 9) & 1 == 1 + } + var builtinIsStrict: Bool { + (self.rawValue >> 10) & 1 == 1 } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 5c8730c40..cce9ffdff 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -239,13 +239,16 @@ extension MEProgram.Builder { mutating func buildQuantify( builtin: _CharacterClassModel.Representation, + isStrict: Bool, + isInverted: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int? ) { instructions.append(.init( .quantify, - .init(quantify: .init(builtin: builtin, kind, minTrips, extraTrips)))) + .init(quantify: .init(builtin: builtin, + isStrict, isInverted, kind, minTrips, extraTrips)))) } mutating func buildAccept() { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index ac831c46c..4909fcf00 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -352,10 +352,9 @@ extension Processor { next = _doMatchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) case .builtin: - // We only emit .quantify if it is non-strict ascii and if it consumes a - // single character - // fixme: also if it is not inverted (this we can fix) - next = _doMatchBuiltin(payload.builtin, false, false) + // We only emit .quantify if it consumes a single character + next = _doMatchBuiltin(payload.builtin, + payload.builtinIsInverted, payload.builtinIsStrict) case .any: // We only emit if any does not match newline // Fixme: we could emit if it matches newline by just having a bit in From bee167fb1eebd4e02bda26734c287e0b88df9719 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 11:30:47 -0700 Subject: [PATCH 26/35] Cleanup magic constants --- .../Engine/InstPayload.swift | 39 ++++++++++++------- 1 
file changed, 25 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index e0fc6dfb1..5c085b0f7 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -373,12 +373,18 @@ struct QuantifyPayload: RawRepresentable { // b27-b18 - extraTrips (8 bit value, one bit for nil) // b18-b16 - Quantification type (one of three types) // b16-b0 - Payload value (depends on payload type) - static let quantKindShift: UInt64 = 16 - static let extraTripsShift: UInt64 = 18 - static let minTripsShift: UInt64 = 27 - static let typeShift: UInt64 = 35 + static let quantKindShift: UInt64 = 16 + static let extraTripsShift: UInt64 = 18 + static let minTripsShift: UInt64 = 27 + static let typeShift: UInt64 = 35 static let maxStorableTrips: UInt64 = (1 << 8) - 1 + var quantKindMask: UInt64 { 3 } + var extraTripsMask: UInt64 { 0x1FF } + var minTripsMask: UInt64 { 0xFF } + var typeMask: UInt64 { 7 } + var payloadMask: UInt64 { 0xFF_FF } + static func packInfoValues( _ kind: AST.Quantification.Kind, _ minTrips: Int, @@ -412,8 +418,9 @@ struct QuantifyPayload: RawRepresentable { _ minTrips: Int, _ extraTrips: Int? ) { - assert(bitset.bits < 0xFF_FF) - self.rawValue = bitset.bits + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset) + assert(bitset.bits <= _payloadMask) + self.rawValue = bitset.bits + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset) } init( @@ -422,7 +429,8 @@ struct QuantifyPayload: RawRepresentable { _ minTrips: Int, _ extraTrips: Int? ) { - self.rawValue = UInt64(asciiChar) + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar) + self.rawValue = UInt64(asciiChar) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar) } init( @@ -442,8 +450,11 @@ struct QuantifyPayload: RawRepresentable { _ extraTrips: Int? 
) { assert(builtin.rawValue < 0xFF) - let packedModel = builtin.rawValue + (isInverted ? 1 << 9 : 0) + (isStrict ? 1 << 10 : 0) - self.rawValue = packedModel + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) + let packedModel = builtin.rawValue + + (isInverted ? 1 << 9 : 0) + + (isStrict ? 1 << 10 : 0) + self.rawValue = packedModel + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) } var type: PayloadType { @@ -451,7 +462,7 @@ struct QuantifyPayload: RawRepresentable { } var quantKind: AST.Quantification.Kind { - switch (self.rawValue >> QuantifyPayload.quantKindShift) & 3 { + switch (self.rawValue >> QuantifyPayload.quantKindShift) & quantKindMask { case 0: return .eager case 1: return .reluctant case 2: return .possessive @@ -461,11 +472,11 @@ struct QuantifyPayload: RawRepresentable { } var minTrips: UInt64 { - (self.rawValue >> QuantifyPayload.minTripsShift) & 0xFF + (self.rawValue >> QuantifyPayload.minTripsShift) & minTripsMask } var extraTrips: UInt64? 
{ - let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & 0x1FF + let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & extraTripsMask if val == 1 { return nil } else { @@ -474,11 +485,11 @@ struct QuantifyPayload: RawRepresentable { } var bitset: AsciiBitsetRegister { - TypedInt(self.rawValue & 0xFF_FF) + TypedInt(self.rawValue & payloadMask) } var asciiChar: UInt8 { - UInt8(asserting: self.rawValue & 0xFF) + UInt8(asserting: self.rawValue & payloadMask) } var builtin: _CharacterClassModel.Representation { From cf0175189adee3cb3676be14990e3c6c29bc0737 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 12:18:55 -0700 Subject: [PATCH 27/35] Add specialized quantify paths --- .../_StringProcessing/Engine/MEBuiltins.swift | 2 +- .../_StringProcessing/Engine/MEQuantify.swift | 121 ++++++++++++++++++ .../_StringProcessing/Engine/Processor.swift | 75 ++--------- 3 files changed, 136 insertions(+), 62 deletions(-) create mode 100644 Sources/_StringProcessing/Engine/MEQuantify.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index fb5aa2555..05231116b 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -1,7 +1,7 @@ @_implementationOnly import _RegexParser // For AssertionKind extension Processor { - mutating func _doMatchBuiltin( + func _doMatchBuiltin( _ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrictAscii: Bool diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift new file mode 100644 index 000000000..1dfcbf848 --- /dev/null +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -0,0 +1,121 @@ +extension Processor { + func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? 
{ + // Future work: Do we want to rework our Processor.Cycle() switch loop + // to do something like this for all of the matching instructions? + // ie: A bunch of _doMatchThing functions that return Input.Index? + // which we then signalFailure if nil or currentPosition = next otherwise + // This would have the benefit of potentially allowing us to not duplicate + // code between the normal matching instructions and this loop here + var next: Input.Index? + switch payload.type { + case .bitset: + next = _doMatchBitset(registers[payload.bitset]) + case .asciiChar: + next = _doMatchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) + case .builtin: + // We only emit .quantify if it consumes a single character + next = _doMatchBuiltin(payload.builtin, + payload.builtinIsInverted, payload.builtinIsStrict) + case .any: + // We only emit if any does not match newline + // Fixme: we could emit if it matches newline by just having a bit in + // the payload, the any payload is empty anyway + let matched = currentPosition != input.endIndex && + !input[currentPosition].isNewline + next = matched ? 
input.index(after: currentPosition) : nil + } + return next + } + + /// Generic quantify instruction interpreter + /// - Handles .eager and .posessive + /// - Handles arbitrary minTrips and extraTrips + mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { + var trips = 0 + var extraTrips = payload.extraTrips + var savePoint = startQuantifierSavePoint() + + while true { + if trips >= payload.minTrips { + if extraTrips == 0 { break } + extraTrips = extraTrips.map({$0 - 1}) + if payload.quantKind == .eager { + savePoint.updateRange(newEnd: currentPosition) + } + } + let next = _doQuantifyMatch(payload) + guard let idx = next else { break } + currentPosition = idx + trips += 1 + } + + if trips < payload.minTrips { + signalFailure() + return false + } + + if payload.quantKind == .eager && !savePoint.rangeIsEmpty { + savePoints.append(savePoint) + } + return true + } + + /// Specialized quantify instruction interpreter for * + mutating func runEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.quantKind == .eager + && payload.minTrips == 0 + && payload.extraTrips == nil) + var savePoint = startQuantifierSavePoint() + + while true { + savePoint.updateRange(newEnd: currentPosition) + let next = _doQuantifyMatch(payload) + guard let idx = next else { break } + currentPosition = idx + } + + if !savePoint.rangeIsEmpty { + savePoints.append(savePoint) + } + return true + } + + /// Specialized quantify instruction interpreter for + + mutating func runEagerOneOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.quantKind == .eager + && payload.minTrips == 1 + && payload.extraTrips == nil) + var savePoint = startQuantifierSavePoint() + while true { + let next = _doQuantifyMatch(payload) + guard let idx = next else { break } + currentPosition = idx + savePoint.updateRange(newEnd: currentPosition) + } + + if savePoint.rangeIsEmpty { + signalFailure() + return false + } + savePoints.append(savePoint) + return true + } + + /// 
Specialized quantify instruction interpreter for ? + mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.minTrips == 0 + && payload.extraTrips == 1) + let next = _doQuantifyMatch(payload) + guard let idx = next else { + return true // matched zero times + } + if payload.quantKind != .possessive { + // Save the zero match + let savePoint = makeSavePoint(currentPC + 1) + savePoints.append(savePoint) + } + currentPosition = idx + return true + } +} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 4909fcf00..40d3960a2 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -312,7 +312,7 @@ extension Processor { _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { guard let curScalar = loadScalar(), - bitset.matches(scalar: curScalar), + bitset.matches(scalar: curScalar), let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { signalFailure() return false @@ -321,64 +321,6 @@ extension Processor { return true } - mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { - var trips = 0 - var extraTrips = payload.extraTrips - var savePoint = startQuantifierSavePoint() - - while true { - if trips >= payload.minTrips { - // exit policy - // fixme: is there a way to optimize the next two lines out if we know - // extraTrips is nil? - if extraTrips == 0 { break } - extraTrips = extraTrips.map({$0 - 1}) - if payload.quantKind == .eager { - savePoint.updateRange(newEnd: currentPosition) - } - } - - // Future work: Do we want to rework our Processor.Cycle() switch loop - // to do something like this for all of the matching instructions? - // ie: A bunch of _doMatchThing instructions that return Input.Index? 
- // which we then signalFailure if nil or currentPosition = next otherwise - // This would have the benefit of potentially allowing us to not duplicate - // code between the normal matching instructions and this loop here - var next: Input.Index? - switch payload.type { - case .bitset: - next = _doMatchBitset(registers[payload.bitset]) - case .asciiChar: - next = _doMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) - case .builtin: - // We only emit .quantify if it consumes a single character - next = _doMatchBuiltin(payload.builtin, - payload.builtinIsInverted, payload.builtinIsStrict) - case .any: - // We only emit if any does not match newline - // Fixme: we could emit if it matches newline by just having a bit in - // the payload, the any payload is empty anyway - let matched = currentPosition != input.endIndex && - !input[currentPosition].isNewline - next = matched ? input.index(after: currentPosition) : nil - } - guard let idx = next else { break } - currentPosition = idx - trips += 1 - } - - if trips < payload.minTrips { - signalFailure() - return false - } - - if payload.quantKind == .eager && !savePoint.rangeIsEmpty { - savePoints.append(savePoint) - } - return true - } - mutating func signalFailure() { guard var savePoint = savePoints.popLast() else { state = .fail @@ -576,8 +518,19 @@ extension Processor { } } case .quantify: - let quant = payload.quantify - if runQuantify(quant) { + let quantPayload = payload.quantify + let matched: Bool + switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.extraTrips) { + case (.eager, 0, nil): + matched = runEagerZeroOrMoreQuantify(quantPayload) + case (.eager, 1, nil): + matched = runEagerOneOrMoreQuantify(quantPayload) + case (_, 0, 1): + matched = runZeroOrOneQuantify(quantPayload) + default: + matched = runQuantify(quantPayload) + } + if matched { controller.step() } From e2f60d325fedf8737c6bf7c6301ce764466cd297 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 
2022 12:46:14 -0700 Subject: [PATCH 28/35] Fix dot quantify --- Sources/_StringProcessing/ByteCodeGen.swift | 16 +++++++++------- .../_StringProcessing/Engine/InstPayload.swift | 8 +++++++- Sources/_StringProcessing/Engine/MEBuilder.swift | 3 ++- .../_StringProcessing/Engine/MEQuantify.swift | 7 ++----- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 7b222a0b1..fbccc4b22 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -479,8 +479,7 @@ fileprivate extension Compiler.ByteCodeGen { // - .char // - .customCharacterClass // - built in character classes - // - .any - + // - .any, .anyNonNewline, .dot // We do this by wrapping a single instruction in a .quantify instruction if optimizationsEnabled && child.shouldDoFastQuant(options) @@ -706,8 +705,11 @@ fileprivate extension Compiler.ByteCodeGen { fatalError("Entered emitFastQuant with an invalid case: Character is not single scalar ascii") } case .any: - assert(!options.dotMatchesNewline, "Entered emitFastQuant with an invalid case: Any matches newlines") - builder.buildQuantifyAny(kind, minTrips, extraTrips) + builder.buildQuantifyAny(matchesNewlines: true, kind, minTrips, extraTrips) + case .anyNonNewline: + builder.buildQuantifyAny(matchesNewlines: false, kind, minTrips, extraTrips) + case .dot: + builder.buildQuantifyAny(matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips) case .characterClass(let cc): let model = cc.model assert(model.consumesSingleGrapheme, @@ -899,9 +901,9 @@ extension DSLTree.Node { case .char(let c): // Only quantify the most common path -> Single scalar ascii values return c._singleScalarAsciiValue != nil - case .any: - // Only quantify if we have a default behavior .any - return !opts.dotMatchesNewline + case .dot, .any, .anyNonNewline: + // Always quantify any/dot + return true case .characterClass(let 
cc): // Only quantify if it consumes a single grapheme return cc.model.consumesSingleGrapheme diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 5c085b0f7..838b08c10 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -434,11 +434,13 @@ struct QuantifyPayload: RawRepresentable { } init( + matchesNewlines: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int? ) { - self.rawValue = QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any) + self.rawValue = (matchesNewlines ? 1 : 0) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any) } init( @@ -492,6 +494,10 @@ struct QuantifyPayload: RawRepresentable { UInt8(asserting: self.rawValue & payloadMask) } + var anyMatchesNewline: Bool { + (self.rawValue & 1) == 1 + } + var builtin: _CharacterClassModel.Representation { _CharacterClassModel.Representation(rawValue: self.rawValue & 0xFF)! } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index cce9ffdff..70cd73973 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -228,13 +228,14 @@ extension MEProgram.Builder { } mutating func buildQuantifyAny( + matchesNewlines: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int? 
) { instructions.append(.init( .quantify, - .init(quantify: .init(kind, minTrips, extraTrips)))) + .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, extraTrips)))) } mutating func buildQuantify( diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 1dfcbf848..47b8544bc 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -18,11 +18,8 @@ extension Processor { next = _doMatchBuiltin(payload.builtin, payload.builtinIsInverted, payload.builtinIsStrict) case .any: - // We only emit if any does not match newline - // Fixme: we could emit if it matches newline by just having a bit in - // the payload, the any payload is empty anyway - let matched = currentPosition != input.endIndex && - !input[currentPosition].isNewline + let matched = currentPosition != input.endIndex + && (!input[currentPosition].isNewline || payload.anyMatchesNewline) next = matched ? input.index(after: currentPosition) : nil } return next From d7015ec27aa0590f45c98fd39ceb721ea5773c7c Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 13:02:59 -0700 Subject: [PATCH 29/35] Remove unneeded save point --- .../_StringProcessing/Engine/Backtracking.swift | 12 ++++++++++++ Sources/_StringProcessing/Engine/MEQuantify.swift | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index fbde29917..397e16c6a 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -54,6 +54,18 @@ extension Processor { rangeEnd = newEnd } + mutating func dropLast(_ input: Input) { + assert(!rangeIsEmpty) + let pos = rangeEnd! 
+ if pos == rangeStart { + // The range is now empty + rangeStart = nil + rangeEnd = nil + } else { + rangeEnd = input.index(before: pos) + } + } + mutating func removeLast(_ input: Input) -> ( pc: InstructionAddress, pos: Position?, diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 47b8544bc..ac35d1840 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -53,7 +53,11 @@ extension Processor { } if payload.quantKind == .eager && !savePoint.rangeIsEmpty { - savePoints.append(savePoint) + // The last save point has saved the current position, so it's unneeded + savePoint.dropLast(input) + if !savePoint.rangeIsEmpty { + savePoints.append(savePoint) + } } return true } @@ -72,6 +76,8 @@ extension Processor { currentPosition = idx } + // The last save point has saved the current position, so it's unneeded + savePoint.dropLast(input) if !savePoint.rangeIsEmpty { savePoints.append(savePoint) } @@ -95,7 +101,11 @@ extension Processor { signalFailure() return false } - savePoints.append(savePoint) + // The last save point has saved the current position, so it's unneeded + savePoint.dropLast(input) + if !savePoint.rangeIsEmpty { + savePoints.append(savePoint) + } return true } From b417434992aaac0c60308b29e05f2339e90235c3 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 14:40:54 -0700 Subject: [PATCH 30/35] experimental signal failure restoring --- .../_StringProcessing/Engine/Processor.swift | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 40d3960a2..f9e0243f9 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -322,7 +322,7 @@ extension Processor { } mutating func signalFailure() { - guard var savePoint = 
savePoints.popLast() else { + guard let savePoint = savePoints.last else { state = .fail return } @@ -335,12 +335,13 @@ extension Processor { PositionRegister: [Input.Index] ) if !savePoint.rangeIsEmpty { - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoint.removeLast(input) - if !savePoint.rangeIsEmpty { - savePoints.append(savePoint) + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints._updateOrRemoveLast { + let sp = $0.removeLast(input) + let shouldKeep = !$0.rangeIsEmpty + return (sp, shouldKeep) } } else { - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoint.destructure + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.removeLast().destructure } assert(stackEnd.rawValue <= callStack.count) @@ -637,4 +638,11 @@ extension Processor { } } - +extension Array { + mutating func _updateOrRemoveLast(_ update: (inout Element) -> (T, shouldKeep: Bool)) -> T { + let idx = index(before: endIndex) + let (val, shouldKeep) = update(&self[idx]) + if !shouldKeep { removeLast() } + return val + } +} From 512bef5717a8762ce945dd77687ec2199954cbea Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 16:52:03 -0700 Subject: [PATCH 31/35] Reduce ARCs in signalFailure() --- .../Engine/Backtracking.swift | 32 ++++++------------- .../_StringProcessing/Engine/MEQuantify.swift | 6 ++-- .../_StringProcessing/Engine/Processor.swift | 21 ++++++------ 3 files changed, 21 insertions(+), 38 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 397e16c6a..9101e31a2 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -41,7 +41,6 @@ extension Processor { intRegisters: [Int], PositionRegister: [Input.Index] ) { - assert(rangeIsEmpty) return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } @@ -54,36 +53,23 @@ extension Processor { rangeEnd = 
newEnd } - mutating func dropLast(_ input: Input) { + /// Move the next range position into pos, and remove it from the range + mutating func takePositionFromRange(_ input: Input) { assert(!rangeIsEmpty) - let pos = rangeEnd! - if pos == rangeStart { - // The range is now empty - rangeStart = nil - rangeEnd = nil - } else { - rangeEnd = input.index(before: pos) - } + pos = rangeEnd! + shrinkRange(input) } - - mutating func removeLast(_ input: Input) -> ( - pc: InstructionAddress, - pos: Position?, - stackEnd: CallStackAddress, - captureEnds: [_StoredCapture], - intRegisters: [Int], - PositionRegister: [Input.Index] - ) { + + /// Shrink the range of the save point by one index, essentially dropping the last index + mutating func shrinkRange(_ input: Input) { assert(!rangeIsEmpty) - let pos = rangeEnd! - if pos == rangeStart { + if rangeEnd == rangeStart { // The range is now empty rangeStart = nil rangeEnd = nil } else { - rangeEnd = input.index(before: pos) + input.formIndex(before: &rangeEnd!) 
} - return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index ac35d1840..508add9e0 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -54,7 +54,7 @@ extension Processor { if payload.quantKind == .eager && !savePoint.rangeIsEmpty { // The last save point has saved the current position, so it's unneeded - savePoint.dropLast(input) + savePoint.shrinkRange(input) if !savePoint.rangeIsEmpty { savePoints.append(savePoint) } @@ -77,7 +77,7 @@ extension Processor { } // The last save point has saved the current position, so it's unneeded - savePoint.dropLast(input) + savePoint.shrinkRange(input) if !savePoint.rangeIsEmpty { savePoints.append(savePoint) } @@ -102,7 +102,7 @@ extension Processor { return false } // The last save point has saved the current position, so it's unneeded - savePoint.dropLast(input) + savePoint.shrinkRange(input) if !savePoint.rangeIsEmpty { savePoints.append(savePoint) } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index f9e0243f9..efb2d1aba 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -322,10 +322,11 @@ extension Processor { } mutating func signalFailure() { - guard let savePoint = savePoints.last else { + guard !savePoints.isEmpty else { state = .fail return } + // FIXME: removing this type annotation causes xcode to crash (5.7 beta 2) let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters): ( pc: InstructionAddress, pos: Position?, @@ -333,15 +334,11 @@ extension Processor { captureEnds: [_StoredCapture], intRegisters: [Int], PositionRegister: [Input.Index] - ) - if !savePoint.rangeIsEmpty { - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints._updateOrRemoveLast { - let sp = 
$0.removeLast(input) - let shouldKeep = !$0.rangeIsEmpty - return (sp, shouldKeep) + ) = savePoints._updateOrRemoveLast { sp in + if !sp.rangeIsEmpty { + sp.takePositionFromRange(input) } - } else { - (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.removeLast().destructure + return (sp.destructure, shouldDrop: sp.rangeIsEmpty) } assert(stackEnd.rawValue <= callStack.count) @@ -639,10 +636,10 @@ extension Processor { } extension Array { - mutating func _updateOrRemoveLast(_ update: (inout Element) -> (T, shouldKeep: Bool)) -> T { + mutating func _updateOrRemoveLast(_ update: (inout Element) -> (T, shouldDrop: Bool)) -> T { let idx = index(before: endIndex) - let (val, shouldKeep) = update(&self[idx]) - if !shouldKeep { removeLast() } + let (val, shouldDrop) = update(&self[idx]) + if shouldDrop { removeLast() } return val } } From ff9c375159043896f189a2e7cfb42d965559e3c3 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 17:23:36 -0700 Subject: [PATCH 32/35] Just do things inline in signalFailure() --- .../_StringProcessing/Engine/Processor.swift | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index efb2d1aba..8a515a7f6 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -326,7 +326,6 @@ extension Processor { state = .fail return } - // FIXME: removing this type annotation causes xcode to crash (5.7 beta 2) let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters): ( pc: InstructionAddress, pos: Position?, @@ -334,11 +333,18 @@ extension Processor { captureEnds: [_StoredCapture], intRegisters: [Int], PositionRegister: [Input.Index] - ) = savePoints._updateOrRemoveLast { sp in - if !sp.rangeIsEmpty { - sp.takePositionFromRange(input) - } - return (sp.destructure, shouldDrop: sp.rangeIsEmpty) + ) + + let idx = 
savePoints.index(before: savePoints.endIndex) + // If we have a quantifier save point, move the next range position into pos + if !savePoints[idx].rangeIsEmpty { + savePoints[idx].takePositionFromRange(input) + } + // If we have a normal save point or an empty quantifier save point, remove it + if savePoints[idx].rangeIsEmpty { + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.removeLast().destructure + } else { + (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints[idx].destructure } assert(stackEnd.rawValue <= callStack.count) @@ -634,12 +640,3 @@ extension Processor { } } } - -extension Array { - mutating func _updateOrRemoveLast(_ update: (inout Element) -> (T, shouldDrop: Bool)) -> T { - let idx = index(before: endIndex) - let (val, shouldDrop) = update(&self[idx]) - if shouldDrop { removeLast() } - return val - } -} From b026402a9864c3bdadbd8003c2ba4e4f18b58b4f Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 19 Jul 2022 18:07:30 -0700 Subject: [PATCH 33/35] Cleanup some comments --- Sources/_StringProcessing/ByteCodeGen.swift | 12 ++---------- Sources/_StringProcessing/Engine/MEQuantify.swift | 6 ------ 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index fbccc4b22..54e174bf5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -682,7 +682,7 @@ fileprivate extension Compiler.ByteCodeGen { _ extraTrips: Int? 
) { // These cases must stay in sync with DSLTree.Node.shouldDoFastQuant - // as well as the compilation paths for these nodes outside of quantification\ + // as well as the compilation paths for these nodes outside of quantification // All assumptions made by the processor in runQuantify() must be checked here // If an error is thrown here, there must be a mistake in shouldDoFastQuant @@ -882,19 +882,11 @@ extension DSLTree.Node { } /// If the given node can be wrapped in a .quantify instruction - /// Currently this is conservative to reduce the coupling in ByteCodeGen between the normal case and - /// the quantified cases - /// - /// Essentially we trade off implementation complexity for runtime speed by adding more true cases to this + /// Currently only allows nodes that match a single grapheme func shouldDoFastQuant(_ opts: MatchingOptions) -> Bool { switch self { case .customCharacterClass(let ccc): // Only quantify ascii only character classes - - // Future work: Should we allow ConsumeFunctions into .quantify? - // this would open up non-ascii custom character classes as well as the - // possibility of wrapping weirder cases into consume functions - // (non-ascii characters for example) return ccc.asAsciiBitset(opts) != nil case .atom(let atom): switch atom { diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 508add9e0..9ce3a51cc 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,11 +1,5 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - // Future work: Do we want to rework our Processor.Cycle() switch loop - // to do something like this for all of the matching instructions? - // ie: A bunch of _doMatchThing functions that return Input.Index? 
- // which we then signalFailure if nil or currentPosition = next otherwise - // This would have the benefit of potentially allowing us to not duplicate - // code between the normal matching instructions and this loop here var next: Input.Index? switch payload.type { case .bitset: From d02c5cd2277d757aa619514634f6aa46e6f92fa5 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Thu, 28 Jul 2022 18:00:05 -0700 Subject: [PATCH 34/35] Cleanup - Make emitFastQuant failable and move the checks into it - Add assertions for .reluctant - Change some static lets to static vars --- Sources/_StringProcessing/ByteCodeGen.swift | 126 +++++++----------- .../Engine/InstPayload.swift | 10 +- .../_StringProcessing/Engine/Processor.swift | 3 + 3 files changed, 54 insertions(+), 85 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 54e174bf5..7a16650a3 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -474,20 +474,7 @@ fileprivate extension Compiler.ByteCodeGen { let minTrips = low assert((extraTrips ?? 1) >= 0) - // We want to specialize common quantification cases - // Allowed nodes are: - // - .char - // - .customCharacterClass - // - built in character classes - // - .any, .anyNonNewline, .dot - // We do this by wrapping a single instruction in a .quantify instruction - if optimizationsEnabled - && child.shouldDoFastQuant(options) - && minTrips <= QuantifyPayload.maxStorableTrips - && extraTrips ?? 
0 <= QuantifyPayload.maxStorableTrips - && options.matchLevel == .graphemeCluster - && updatedKind != .reluctant { - emitFastQuant(child, updatedKind, minTrips, extraTrips) + if tryEmitFastQuant(child, updatedKind, minTrips, extraTrips) { return } @@ -675,45 +662,59 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } - mutating func emitFastQuant( + /// Specialized quantification instruction for repetition of certain nodes in grapheme semantic mode + /// Allowed nodes are: + /// - single ascii scalar .char + /// - ascii .customCharacterClass + /// - single grapheme consuming built in character classes + /// - .any, .anyNonNewline, .dot + mutating func tryEmitFastQuant( _ child: DSLTree.Node, _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int? - ) { - // These cases must stay in sync with DSLTree.Node.shouldDoFastQuant - // as well as the compilation paths for these nodes outside of quantification - - // All assumptions made by the processor in runQuantify() must be checked here - // If an error is thrown here, there must be a mistake in shouldDoFastQuant - // letting in an invalid case + ) -> Bool { + guard optimizationsEnabled + && minTrips <= QuantifyPayload.maxStorableTrips + && extraTrips ?? 
0 <= QuantifyPayload.maxStorableTrips + && options.matchLevel == .graphemeCluster + && kind != .reluctant else { + return false + } - // Coupling is bad but we do it for _speed_ switch child { case .customCharacterClass(let ccc): - if let bitset = ccc.asAsciiBitset(options) { - builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips) - } else { - fatalError("Entered emitFastQuant with an invalid case: Unable to generate bitset") + // ascii only custom character class + guard let bitset = ccc.asAsciiBitset(options) else { + return false } + builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips) + case .atom(let atom): switch atom { case .char(let c): - if let val = c._singleScalarAsciiValue { - builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips) - } else { - fatalError("Entered emitFastQuant with an invalid case: Character is not single scalar ascii") + // Single scalar ascii value character + guard let val = c._singleScalarAsciiValue else { + return false } + builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips) + case .any: - builder.buildQuantifyAny(matchesNewlines: true, kind, minTrips, extraTrips) + builder.buildQuantifyAny( + matchesNewlines: true, kind, minTrips, extraTrips) case .anyNonNewline: - builder.buildQuantifyAny(matchesNewlines: false, kind, minTrips, extraTrips) + builder.buildQuantifyAny( + matchesNewlines: false, kind, minTrips, extraTrips) case .dot: - builder.buildQuantifyAny(matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips) + builder.buildQuantifyAny( + matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips) + case .characterClass(let cc): + // Custom character class that consumes a single grapheme let model = cc.model - assert(model.consumesSingleGrapheme, - "Entered emitFastQuant with an invalid case: Builtin class that does not consume a single grapheme") + guard model.consumesSingleGrapheme else { + return false + } builder.buildQuantify( builtin: model.cc, 
isStrict: model.isStrictAscii(options: options), @@ -722,16 +723,20 @@ fileprivate extension Compiler.ByteCodeGen { minTrips, extraTrips) default: - fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") + return false } case .convertedRegexLiteral(let node, _): - emitFastQuant(node, kind, minTrips, extraTrips) + return tryEmitFastQuant(node, kind, minTrips, extraTrips) case .nonCapturingGroup(let groupKind, let node): - assert(groupKind.ast == .nonCapture, "Entered emitFastQuant with an invalid case: Invalid nonCapturingGroup type") - emitFastQuant(node, kind, minTrips, extraTrips) + // .nonCapture nonCapturingGroups are ignored during compilation + guard groupKind.ast == .nonCapture else { + return false + } + return tryEmitFastQuant(node, kind, minTrips, extraTrips) default: - fatalError("Entered emitFastQuant with an invalid case: DSLTree.Node.shouldDoFastQuant is out of sync") + return false } + return true } mutating func emitCustomCharacterClass( @@ -880,43 +885,4 @@ extension DSLTree.Node { default: return false } } - - /// If the given node can be wrapped in a .quantify instruction - /// Currently only allows nodes that match a single grapheme - func shouldDoFastQuant(_ opts: MatchingOptions) -> Bool { - switch self { - case .customCharacterClass(let ccc): - // Only quantify ascii only character classes - return ccc.asAsciiBitset(opts) != nil - case .atom(let atom): - switch atom { - case .char(let c): - // Only quantify the most common path -> Single scalar ascii values - return c._singleScalarAsciiValue != nil - case .dot, .any, .anyNonNewline: - // Always quantify any/dot - return true - case .characterClass(let cc): - // Only quantify if it consumes a single grapheme - return cc.model.consumesSingleGrapheme - default: - return false - } - case .convertedRegexLiteral(let node, _): - return node.shouldDoFastQuant(opts) - case .nonCapturingGroup(let kind, let child): - switch kind.ast { - case .nonCapture: 
- return child.shouldDoFastQuant(opts) - default: - return false - } - case .orderedChoice: - // Future work: Could we support ordered choice by compacting our payload - // representation and supporting an alternation of up to N supported nodes? - return false - default: - return false - } - } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 838b08c10..b37252fd0 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -373,11 +373,11 @@ struct QuantifyPayload: RawRepresentable { // b27-b18 - extraTrips (8 bit value, one bit for nil) // b18-b16 - Quantification type (one of three types) // b16-b0 - Payload value (depends on payload type) - static let quantKindShift: UInt64 = 16 - static let extraTripsShift: UInt64 = 18 - static let minTripsShift: UInt64 = 27 - static let typeShift: UInt64 = 35 - static let maxStorableTrips: UInt64 = (1 << 8) - 1 + static var quantKindShift: UInt64 { 16 } + static var extraTripsShift: UInt64 { 18 } + static var minTripsShift: UInt64 { 27 } + static var typeShift: UInt64 { 35 } + static var maxStorableTrips: UInt64 { (1 << 8) - 1 } var quantKindMask: UInt64 { 3 } var extraTripsMask: UInt64 { 0x1FF } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 8a515a7f6..d13474809 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -525,6 +525,9 @@ extension Processor { let quantPayload = payload.quantify let matched: Bool switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.extraTrips) { + case (.reluctant, _, _): + assertionFailure(".reluctant is not supported by .quantify") + return case (.eager, 0, nil): matched = runEagerZeroOrMoreQuantify(quantPayload) case (.eager, 1, nil): From f90f01c6c0d3d92c26106ebedd4a95cb98dfb8b2 Mon Sep 17 00:00:00 2001 From: 
Lily Lin Date: Wed, 3 Aug 2022 13:27:20 -0700 Subject: [PATCH 35/35] Slight cleanup --- Sources/_StringProcessing/ByteCodeGen.swift | 9 +++-- .../Engine/Backtracking.swift | 6 ++-- .../Engine/InstPayload.swift | 33 ++++++++++--------- .../_StringProcessing/Engine/MEQuantify.swift | 18 +++++----- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index bf0a18e2b..66fefc49e 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -678,7 +678,6 @@ fileprivate extension Compiler.ByteCodeGen { && kind != .reluctant else { return false } - switch child { case .customCharacterClass(let ccc): // ascii only custom character class @@ -686,7 +685,7 @@ fileprivate extension Compiler.ByteCodeGen { return false } builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips) - + case .atom(let atom): switch atom { case .char(let c): @@ -695,7 +694,7 @@ fileprivate extension Compiler.ByteCodeGen { return false } builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips) - + case .any: builder.buildQuantifyAny( matchesNewlines: true, kind, minTrips, extraTrips) @@ -705,7 +704,7 @@ fileprivate extension Compiler.ByteCodeGen { case .dot: builder.buildQuantifyAny( matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips) - + case .characterClass(let cc): // Custom character class that consumes a single grapheme let model = cc.asRuntimeModel(options) @@ -733,7 +732,7 @@ fileprivate extension Compiler.ByteCodeGen { } return true } - + /// Coalesce any adjacent scalar members in a custom character class together. /// This is required in order to produce correct grapheme matching behavior. 
func coalescingCustomCharacterClassMembers( diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 9101e31a2..3ebb060c9 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -43,9 +43,9 @@ extension Processor { ) { return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } - + var rangeIsEmpty: Bool { rangeEnd == nil } - + mutating func updateRange(newEnd: Input.Index) { if rangeStart == nil { rangeStart = newEnd @@ -59,7 +59,7 @@ extension Processor { pos = rangeEnd! shrinkRange(input) } - + /// Shrink the range of the save point by one index, essentially dropping the last index mutating func shrinkRange(_ input: Input) { assert(!rangeIsEmpty) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 86e39875e..1e2ed757b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -332,7 +332,9 @@ extension Instruction.Payload { ) { interpretPair() } + // MARK: Struct payloads + init(_ model: _CharacterClassModel) { self.init(CharacterClassPayload(model).rawValue) } @@ -357,14 +359,13 @@ extension Instruction.Payload { // MARK: Struct definitions struct QuantifyPayload: RawRepresentable { let rawValue: UInt64 - enum PayloadType: UInt64 { case bitset = 0 case asciiChar = 1 case any = 2 case builtin = 4 } - + // Future work: optimize this layout -> payload type should be a fast switch // The top 8 bits are reserved for the opcode so we have 56 bits to work with // b55-b38 - Unused @@ -378,13 +379,13 @@ struct QuantifyPayload: RawRepresentable { static var minTripsShift: UInt64 { 27 } static var typeShift: UInt64 { 35 } static var maxStorableTrips: UInt64 { (1 << 8) - 1 } - + var quantKindMask: UInt64 { 3 } var extraTripsMask: UInt64 { 0x1FF } var minTripsMask: UInt64 { 0xFF } var typeMask: 
UInt64 { 7 } var payloadMask: UInt64 { 0xFF_FF } - + static func packInfoValues( _ kind: AST.Quantification.Kind, _ minTrips: Int, @@ -406,12 +407,12 @@ struct QuantifyPayload: RawRepresentable { (UInt64(minTrips) << QuantifyPayload.minTripsShift) + (type.rawValue << QuantifyPayload.typeShift) } - + init(rawValue: UInt64) { self.rawValue = rawValue assert(rawValue & _opcodeMask == 0) } - + init( bitset: AsciiBitsetRegister, _ kind: AST.Quantification.Kind, @@ -422,7 +423,7 @@ struct QuantifyPayload: RawRepresentable { self.rawValue = bitset.bits + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset) } - + init( asciiChar: UInt8, _ kind: AST.Quantification.Kind, @@ -432,7 +433,7 @@ struct QuantifyPayload: RawRepresentable { self.rawValue = UInt64(asciiChar) + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar) } - + init( matchesNewlines: Bool, _ kind: AST.Quantification.Kind, @@ -442,7 +443,7 @@ struct QuantifyPayload: RawRepresentable { self.rawValue = (matchesNewlines ? 1 : 0) + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any) } - + init( model: _CharacterClassModel, _ kind: AST.Quantification.Kind, @@ -457,11 +458,11 @@ struct QuantifyPayload: RawRepresentable { self.rawValue = packedModel + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) } - + var type: PayloadType { PayloadType(rawValue: (self.rawValue >> QuantifyPayload.typeShift) & 7)! } - + var quantKind: AST.Quantification.Kind { switch (self.rawValue >> QuantifyPayload.quantKindShift) & quantKindMask { case 0: return .eager @@ -471,11 +472,11 @@ struct QuantifyPayload: RawRepresentable { fatalError("Unreachable") } } - + var minTrips: UInt64 { (self.rawValue >> QuantifyPayload.minTripsShift) & minTripsMask } - + var extraTrips: UInt64? 
{ let val = (self.rawValue >> QuantifyPayload.extraTripsShift) & extraTripsMask if val == 1 { @@ -484,15 +485,15 @@ struct QuantifyPayload: RawRepresentable { return val >> 1 } } - + var bitset: AsciiBitsetRegister { TypedInt(self.rawValue & payloadMask) } - + var asciiChar: UInt8 { UInt8(asserting: self.rawValue & payloadMask) } - + var anyMatchesNewline: Bool { (self.rawValue & 1) == 1 } diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index d150f42fa..9d17dc9bd 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -21,7 +21,7 @@ extension Processor { } return next } - + /// Generic quantify instruction interpreter /// - Handles .eager and .posessive /// - Handles arbitrary minTrips and extraTrips @@ -43,12 +43,12 @@ extension Processor { currentPosition = idx trips += 1 } - + if trips < payload.minTrips { signalFailure() return false } - + if payload.quantKind == .eager && !savePoint.rangeIsEmpty { // The last save point has saved the current position, so it's unneeded savePoint.shrinkRange(input) @@ -58,21 +58,21 @@ extension Processor { } return true } - + /// Specialized quantify instruction interpreter for * mutating func runEagerZeroOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.extraTrips == nil) var savePoint = startQuantifierSavePoint() - + while true { savePoint.updateRange(newEnd: currentPosition) let next = _doQuantifyMatch(payload) guard let idx = next else { break } currentPosition = idx } - + // The last save point has saved the current position, so it's unneeded savePoint.shrinkRange(input) if !savePoint.rangeIsEmpty { @@ -80,7 +80,7 @@ extension Processor { } return true } - + /// Specialized quantify instruction interpreter for + mutating func runEagerOneOrMoreQuantify(_ payload: QuantifyPayload) -> Bool { assert(payload.quantKind == 
.eager @@ -93,7 +93,7 @@ extension Processor { currentPosition = idx savePoint.updateRange(newEnd: currentPosition) } - + if savePoint.rangeIsEmpty { signalFailure() return false @@ -105,7 +105,7 @@ extension Processor { } return true } - + /// Specialized quantify instruction interpreter for ? mutating func runZeroOrOneQuantify(_ payload: QuantifyPayload) -> Bool { assert(payload.minTrips == 0