From 3b6b676688f3bd63b7323325e4af16525615edf6 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 14:20:36 -0700 Subject: [PATCH 01/22] Copy over new ascii bitset --- .../_StringProcessing/ConsumerInterface.swift | 7 ++ Sources/_StringProcessing/Regex/DSLTree.swift | 80 --------------- .../Utility/AsciiBitset.swift | 99 +++++++++++++++++++ 3 files changed, 106 insertions(+), 80 deletions(-) create mode 100644 Sources/_StringProcessing/Utility/AsciiBitset.swift diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index af46b5381..e4304fa6f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -11,6 +11,13 @@ @_implementationOnly import _RegexParser +extension Character { + var _singleScalarAsciiValue: UInt8? { + guard self != "\r\n" else { return nil } + return asciiValue + } +} + extension DSLTree.Node { /// Attempt to generate a consumer from this AST node /// diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 0714c5d2c..ab6c25a59 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -162,86 +162,6 @@ extension DSLTree { indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) } - - internal struct AsciiBitset { - let isInverted: Bool - var a: UInt64 = 0 - var b: UInt64 = 0 - - init(isInverted: Bool) { - self.isInverted = isInverted - } - - init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { - self.isInverted = isInverted - add(val, isCaseInsensitive) - } - - init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { - self.isInverted = isInverted - for val in low...high { - add(val, isCaseInsensitive) - } - } - - internal init( - a: UInt64, - b: UInt64, - isInverted: Bool - ) { - 
self.isInverted = isInverted - self.a = a - self.b = b - } - - internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { - setBit(val) - if isCaseInsensitive { - switch val { - case 64...90: setBit(val + 32) - case 97...122: setBit(val - 32) - default: break - } - } - } - - internal mutating func setBit(_ val: UInt8) { - if val < 64 { - a = a | 1 << val - } else { - b = b | 1 << (val - 64) - } - } - - internal func matches(char: Character) -> Bool { - let ret: Bool - if let val = char.asciiValue { - if val < 64 { - ret = (a >> val) & 1 == 1 - } else { - ret = (b >> (val - 64)) & 1 == 1 - } - } else { - ret = false - } - - if isInverted { - return !ret - } - - return ret - } - - /// Joins another bitset from a Member of the same CustomCharacterClass - internal func union(_ other: AsciiBitset) -> AsciiBitset { - precondition(self.isInverted == other.isInverted) - return AsciiBitset( - a: self.a | other.a, - b: self.b | other.b, - isInverted: self.isInverted - ) - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift new file mode 100644 index 000000000..ad3159820 --- /dev/null +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -0,0 +1,99 @@ +extension DSLTree.CustomCharacterClass { + internal struct AsciiBitset { + let isInverted: Bool + var a: UInt64 = 0 + var b: UInt64 = 0 + + init(isInverted: Bool) { + self.isInverted = isInverted + } + + init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { + self.isInverted = isInverted + add(val, isCaseInsensitive) + } + + init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { + self.isInverted = isInverted + for val in low...high { + add(val, isCaseInsensitive) + } + } + + internal init( + a: UInt64, + b: UInt64, + isInverted: Bool + ) { + self.isInverted = isInverted + self.a = a + self.b = b + } + + internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { + 
setBit(val) + if isCaseInsensitive { + switch val { + case 64...90: setBit(val + 32) + case 97...122: setBit(val - 32) + default: break + } + } + } + + internal mutating func setBit(_ val: UInt8) { + if val < 64 { + a = a | 1 << val + } else { + b = b | 1 << (val - 64) + } + } + + private func matches(_ val: UInt8) -> Bool { + if val < 64 { + return (a >> val) & 1 == 1 + } else { + return (b >> (val - 64)) & 1 == 1 + } + } + + internal func matches(char: Character) -> Bool { + let matched: Bool + if let val = char._singleScalarAsciiValue { + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + internal func matches(scalar: Unicode.Scalar) -> Bool { + let matched: Bool + if scalar.isASCII { + let val = UInt8(ascii: scalar) + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + /// Joins another bitset from a Member of the same CustomCharacterClass + internal func union(_ other: AsciiBitset) -> AsciiBitset { + precondition(self.isInverted == other.isInverted) + return AsciiBitset( + a: self.a | other.a, + b: self.b | other.b, + isInverted: self.isInverted + ) + } + } +} From 33caa79f06fe6a8b4690c351d913b55359166825 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 14:21:39 -0700 Subject: [PATCH 02/22] Add matchBuiltin --- Sources/_StringProcessing/ByteCodeGen.swift | 15 ++ Sources/_StringProcessing/Compiler.swift | 3 +- .../Engine/InstPayload.swift | 16 ++ .../Engine/Instruction.swift | 2 +- .../_StringProcessing/Engine/MEBuilder.swift | 10 ++ .../_StringProcessing/Engine/MEBuiltins.swift | 4 - .../_StringProcessing/Engine/Processor.swift | 145 ++++++++++++++++-- .../_CharacterClassModel.swift | 92 +++++++++++ 8 files changed, 270 insertions(+), 17 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 820a4c721..27607706b 100644 --- 
a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -66,6 +66,11 @@ fileprivate extension Compiler.ByteCodeGen { options.apply(optionSequence.ast) case let .unconverted(astAtom): + if optimizationsEnabled, + let cc = astAtom.ast.characterClass?.builtinCC { + emitBuiltinCharacterClass(cc) + return + } if let consumer = try astAtom.ast.generateConsumer(options) { builder.buildConsume(by: consumer) } else { @@ -95,6 +100,16 @@ fileprivate extension Compiler.ByteCodeGen { throw Unsupported("Backreference kind: \(ref)") } } + + mutating func emitBuiltinCharacterClass( + _ cc: BuiltinCC + ) { + builder.buildMatchBuiltin( + cc, + cc.isStrict(options: options), + cc.asciiBitset, + isScalar: options.semanticLevel == .unicodeScalar) + } mutating func emitAssertion( _ kind: AST.Atom.AssertionKind diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index f47898e4e..88e2ce8e3 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -64,7 +64,7 @@ func _compileRegex( ) throws -> Executor { let ast = try parse(regex, syntax) let dsl: DSLTree - + print(ast) switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) @@ -75,6 +75,7 @@ func _compileRegex( case .none: dsl = ast.dslTree } + print(dsl) let program = try Compiler(tree: dsl).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c614e10fd..deabf7231 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -203,6 +203,22 @@ extension Instruction.Payload { var bitset: AsciiBitsetRegister { interpret() } + + init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool, bitset: AsciiBitsetRegister) { + let strictBit = 
isStrict ? 1 << 15 : 0 + let scalarBit = isScalar ? 1 << 14 : 0 + // val must be 16 bits, reserve the top 2 bits for if it is strict ascii or scalar + assert(cc.rawValue <= 0x3F_FF) + let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit) + self.init(val, bitset) + } + var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool, bitset: AsciiBitsetRegister) { + let (val, bitset): (UInt64, AsciiBitsetRegister) = self.interpretPair() + let cc = BuiltinCC(rawValue: val & 0x3F_FF)! + let isStrict = (val >> 15) & 1 == 1 + let isScalar = (val >> 14) & 1 == 1 + return (cc, isStrict, isScalar, bitset) + } init(consumer: ConsumeFunctionRegister) { self.init(consumer) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4e715ad9d..5e5d51acf 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -92,7 +92,7 @@ extension Instruction { case builtinAssertion /// TODO: builtin character classes - case builtinCharacterClass + case matchBuiltin // MARK: Extension points diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 676b21473..e1326fbd9 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -154,6 +154,16 @@ extension MEProgram.Builder { instructions.append(.init( .matchBitset, .init(bitset: makeAsciiBitset(b)))) } + + mutating func buildMatchBuiltin( + _ cc: BuiltinCC, + _ isStrict: Bool, + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalar: Bool + ) { + instructions.append(.init( + .matchBuiltin, .init(cc, isStrict, isScalar, bitset: makeAsciiBitset(bitset)))) + } mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index f791da37e..720a41618 100644 
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -6,8 +6,4 @@ extension Processor { mutating func builtinAssertion() { fatalError("TODO: assertions and anchors") } - - mutating func builtinCharacterClass() { - fatalError("TODO: character classes") - } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a5368138c..05f91ae35 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -226,7 +226,11 @@ extension Processor { } return true } - + + func loadScalar() -> Unicode.Scalar? { + currentPosition < end ? input.unicodeScalars[currentPosition] : nil + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -240,6 +244,117 @@ extension Processor { _uncheckedForcedConsumeOne() return true } + + mutating func matchBuiltin( + _ cc: BuiltinCC, + _ isStrictAscii: Bool, + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let c = load() else { + signalFailure() + return false + } + + // Fast path: See if c is a single scalar ascii character + // If so, and it matches, consume a character + // Note: CR-LF will fall through because it is not a single scalar + if bitset.matches(char: c) && cc != .anyScalar { + _uncheckedForcedConsumeOne() + return true + } + + // Slow path: Do full match + var matched: Bool + var next = input.index(after: currentPosition) + switch cc { + // lily note: when do these `any` cases appear? can they be compiled + // into consume instructions at compile time? 
+ case .any, .anyGrapheme: matched = true + case .anyScalar: + matched = true + next = input.unicodeScalars.index(after: currentPosition) + case .digit: + matched = c.isNumber && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = c.isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !isStrictAscii) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !isStrictAscii) + case .whitespace: + matched = c.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) + } + + if matched { + currentPosition = next + return true + } else { + signalFailure() + return false + } + } + + mutating func matchBuiltinScalar( + _ cc: BuiltinCC, + _ isStrictAscii: Bool, + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let c = loadScalar() else { + signalFailure() + return false + } + + // Fast path: See if c is a single scalar ascii character + // If so, and it matches, consume a character + // Note: CR-LF must be matched fully if we are matching a .newlineSequence + // so exclude "\r" from the fast path + if bitset.matches(scalar: c) && cc != .anyGrapheme && c != "\r" { + input.unicodeScalars.formIndex(after: &currentPosition) + return true + } + + // Slow path: Do full match + var matched: Bool + var next = input.unicodeScalars.index(after: currentPosition) + switch cc { + case .any: matched = true + case .anyScalar: matched = true + case .anyGrapheme: + matched = true + next = input.index(after: currentPosition) + case .digit: + matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) + case .verticalWhitespace: + matched 
= c.isNewline && (c.isASCII || !isStrictAscii) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !isStrictAscii) + // lily note: what exactly is this doing? matching a full cr-lf character + // even though its in scalar mode? why? + if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { + input.unicodeScalars.formIndex(after: &next) + } + case .whitespace: + matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) + } + + if matched { + currentPosition = next + return true + } else { + signalFailure() + return false + } + } mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters) = @@ -385,6 +500,19 @@ extension Processor { controller.step() } + case .matchBuiltin: + let (cc, isStrict, isScalar, reg) = payload.builtinCCPayload + let bitset = registers[reg] + if isScalar { + if matchBuiltinScalar(cc, isStrict, bitset) { + controller.step() + } + } else { + if matchBuiltin(cc, isStrict, bitset) { + controller.step() + } + } + case .consumeBy: let reg = payload.consumer guard currentPosition < searchBounds.upperBound, @@ -450,16 +578,14 @@ extension Processor { case .beginCapture: let capNum = Int( asserting: payload.capture.rawValue) + storedCaptures[capNum].startCapture(currentPosition) + controller.step() - storedCaptures[capNum].startCapture(currentPosition) - controller.step() - - case .endCapture: + case .endCapture: let capNum = Int( asserting: payload.capture.rawValue) - - storedCaptures[capNum].endCapture(currentPosition) - controller.step() + storedCaptures[capNum].endCapture(currentPosition) + controller.step() case .transformCapture: let (cap, trans) = payload.pairedCaptureTransform @@ -490,9 +616,6 @@ extension Processor { case .builtinAssertion: builtinAssertion() - - case .builtinCharacterClass: - builtinCharacterClass() } } } diff --git 
a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..87f1c708e 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -592,3 +592,95 @@ extension _CharacterClassModel { } } + +internal enum BuiltinCC: UInt64 { + case any = 1 + case anyGrapheme + case anyScalar + case digit + case hexDigit + case horizontalWhitespace + case newlineSequence + case verticalWhitespace + case whitespace + case word +} + +extension BuiltinCC { + func isStrict(options: MatchingOptions) -> Bool { + switch self { + case .digit: return options.usesASCIIDigits + case .hexDigit: return options.usesASCIIDigits + case .horizontalWhitespace: return options.usesASCIISpaces + case .newlineSequence: return options.usesASCIISpaces + case .whitespace: return options.usesASCIISpaces + case .word: return options.usesASCIIWord + default: return false + } + } + + /// A bitset representing the ascii values that this character class can match + var asciiBitset: DSLTree.CustomCharacterClass.AsciiBitset { + let allAscii = Array(0...127).map { Character(Unicode.Scalar($0)) } + let filtered: [Character] + switch self { + case .any: + filtered = allAscii + case .anyGrapheme: + filtered = allAscii + case .anyScalar: + filtered = allAscii + case .digit: + filtered = allAscii.filter { $0.isNumber } + case .hexDigit: + filtered = allAscii.filter { $0.isHexDigit } + case .horizontalWhitespace: + filtered = allAscii.filter { $0.unicodeScalars.first?.isHorizontalWhitespace == true } + case .newlineSequence: + filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } + case .verticalWhitespace: + filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } + case .whitespace: + filtered = allAscii.filter { $0.isWhitespace == true } + case .word: + filtered = allAscii.filter { $0.isWordCharacter } + } + var bitset = 
DSLTree.CustomCharacterClass.AsciiBitset(isInverted: false) + for c in filtered { bitset.add(c.asciiValue!, false) } + return bitset + } +} + +extension _CharacterClassModel { + internal var builtinCC: BuiltinCC? { + if isInverted { return nil } // lily todo: add another flag to the payload? when is this set? why are there so many weird edge cases in ccm? it feels like it's trying to model both builtins and custom models + + // in that case, should we just convert a ccm to a ccc + // if it has these weird flags set? + // completely remove ccm from compilation and just emit either a builtincc or a ccc or an advance + switch self.cc { + case .any: + return .any + case .anyGrapheme: + return .anyGrapheme + case .anyScalar: + return .anyScalar + case .digit: + return .digit + case .hexDigit: + return .hexDigit + case .horizontalWhitespace: + return .horizontalWhitespace + case .newlineSequence: + return .newlineSequence + case .verticalWhitespace: + return .verticalWhitespace + case .whitespace: + return .whitespace + case .word: + return .word + case .custom(_): + return nil + } + } +} From 139daa56a1c2a5a65536edceeed4ff736bb640c4 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 14:37:10 -0700 Subject: [PATCH 03/22] Remove debug prints --- Sources/_StringProcessing/Compiler.swift | 2 -- 1 file changed, 2 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 88e2ce8e3..bc6a4ec99 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -64,7 +64,6 @@ func _compileRegex( ) throws -> Executor { let ast = try parse(regex, syntax) let dsl: DSLTree - print(ast) switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) @@ -75,7 +74,6 @@ func _compileRegex( case .none: dsl = ast.dslTree } - print(dsl) let program = try Compiler(tree: dsl).emit() return 
Executor(program: program) } From 9abf4afd6309d3c57aca02ee8404d77d639c4c22 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 16:22:07 -0700 Subject: [PATCH 04/22] Remove bitset fast path --- .../Engine/InstPayload.swift | 10 +++--- .../_StringProcessing/Engine/MEBuilder.swift | 2 +- .../_StringProcessing/Engine/Processor.swift | 31 +++---------------- 3 files changed, 11 insertions(+), 32 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index deabf7231..0c10d6373 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -204,20 +204,20 @@ extension Instruction.Payload { interpret() } - init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool, bitset: AsciiBitsetRegister) { + init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool) { let strictBit = isStrict ? 1 << 15 : 0 let scalarBit = isScalar ? 1 << 14 : 0 // val must be 16 bits, reserve the top 2 bits for if it is strict ascii or scalar assert(cc.rawValue <= 0x3F_FF) let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit) - self.init(val, bitset) + self.init(val) } - var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool, bitset: AsciiBitsetRegister) { - let (val, bitset): (UInt64, AsciiBitsetRegister) = self.interpretPair() + var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool) { + let val = self.rawValue let cc = BuiltinCC(rawValue: val & 0x3F_FF)! 
let isStrict = (val >> 15) & 1 == 1 let isScalar = (val >> 14) & 1 == 1 - return (cc, isStrict, isScalar, bitset) + return (cc, isStrict, isScalar) } init(consumer: ConsumeFunctionRegister) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index e1326fbd9..2ed6f599b 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -162,7 +162,7 @@ extension MEProgram.Builder { isScalar: Bool ) { instructions.append(.init( - .matchBuiltin, .init(cc, isStrict, isScalar, bitset: makeAsciiBitset(bitset)))) + .matchBuiltin, .init(cc, isStrict, isScalar))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 05f91ae35..81e304b0f 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -247,22 +247,13 @@ extension Processor { mutating func matchBuiltin( _ cc: BuiltinCC, - _ isStrictAscii: Bool, - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ isStrictAscii: Bool ) -> Bool { guard let c = load() else { signalFailure() return false } - // Fast path: See if c is a single scalar ascii character - // If so, and it matches, consume a character - // Note: CR-LF will fall through because it is not a single scalar - if bitset.matches(char: c) && cc != .anyScalar { - _uncheckedForcedConsumeOne() - return true - } - // Slow path: Do full match var matched: Bool var next = input.index(after: currentPosition) @@ -300,23 +291,12 @@ extension Processor { mutating func matchBuiltinScalar( _ cc: BuiltinCC, - _ isStrictAscii: Bool, - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ isStrictAscii: Bool ) -> Bool { guard let c = loadScalar() else { signalFailure() return false } - - // Fast path: See if c is a single scalar ascii character - // If so, and it matches, consume a character - // Note: CR-LF 
must be matched fully if we are matching a .newlineSequence - // so exclude "\r" from the fast path - if bitset.matches(scalar: c) && cc != .anyGrapheme && c != "\r" { - input.unicodeScalars.formIndex(after: &currentPosition) - return true - } - // Slow path: Do full match var matched: Bool var next = input.unicodeScalars.index(after: currentPosition) @@ -501,14 +481,13 @@ extension Processor { } case .matchBuiltin: - let (cc, isStrict, isScalar, reg) = payload.builtinCCPayload - let bitset = registers[reg] + let (cc, isStrict, isScalar) = payload.builtinCCPayload if isScalar { - if matchBuiltinScalar(cc, isStrict, bitset) { + if matchBuiltinScalar(cc, isStrict) { controller.step() } } else { - if matchBuiltin(cc, isStrict, bitset) { + if matchBuiltin(cc, isStrict) { controller.step() } } From 286f5d8b1ec3f0e7d2cb06ac672de86439612b3a Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 5 Jul 2022 17:15:53 -0700 Subject: [PATCH 05/22] Fully remove remnants of the bitset fast path --- Sources/_StringProcessing/ByteCodeGen.swift | 1 - .../_StringProcessing/Engine/MEBuilder.swift | 1 - .../_CharacterClassModel.swift | 31 ------------------- 3 files changed, 33 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 27607706b..1a38dec29 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -107,7 +107,6 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildMatchBuiltin( cc, cc.isStrict(options: options), - cc.asciiBitset, isScalar: options.semanticLevel == .unicodeScalar) } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 2ed6f599b..3a7784f57 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -158,7 +158,6 @@ extension MEProgram.Builder { mutating func buildMatchBuiltin( _ cc: BuiltinCC, _ isStrict: Bool, - _ 
bitset: DSLTree.CustomCharacterClass.AsciiBitset, isScalar: Bool ) { instructions.append(.init( diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 87f1c708e..3d23e5399 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -618,37 +618,6 @@ extension BuiltinCC { default: return false } } - - /// A bitset representing the ascii values that this character class can match - var asciiBitset: DSLTree.CustomCharacterClass.AsciiBitset { - let allAscii = Array(0...127).map { Character(Unicode.Scalar($0)) } - let filtered: [Character] - switch self { - case .any: - filtered = allAscii - case .anyGrapheme: - filtered = allAscii - case .anyScalar: - filtered = allAscii - case .digit: - filtered = allAscii.filter { $0.isNumber } - case .hexDigit: - filtered = allAscii.filter { $0.isHexDigit } - case .horizontalWhitespace: - filtered = allAscii.filter { $0.unicodeScalars.first?.isHorizontalWhitespace == true } - case .newlineSequence: - filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } - case .verticalWhitespace: - filtered = allAscii.filter { $0.unicodeScalars.first?.isNewline == true } - case .whitespace: - filtered = allAscii.filter { $0.isWhitespace == true } - case .word: - filtered = allAscii.filter { $0.isWordCharacter } - } - var bitset = DSLTree.CustomCharacterClass.AsciiBitset(isInverted: false) - for c in filtered { bitset.add(c.asciiValue!, false) } - return bitset - } } extension _CharacterClassModel { From e593ddb733dbd3a2217864f8963384f12651c222 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 11 Jul 2022 15:34:50 -0700 Subject: [PATCH 06/22] Completely replace AssertionFunction with regexAssert(by:) --- Sources/_StringProcessing/ByteCodeGen.swift | 136 +----------------- .../Engine/InstPayload.swift | 66 ++++++++- .../_StringProcessing/Engine/MEBuilder.swift | 29 ++-- 
.../_StringProcessing/Engine/MEProgram.swift | 9 -- .../_StringProcessing/Engine/Processor.swift | 125 ++++++++++++++-- .../_StringProcessing/Engine/Registers.swift | 9 -- .../Unicode/WordBreaking.swift | 33 +++++ .../_StringProcessing/Utility/TypedInt.swift | 4 - 8 files changed, 231 insertions(+), 180 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2368791ee..d0f8c8266 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -127,136 +127,12 @@ fileprivate extension Compiler.ByteCodeGen { mutating func emitAssertion( _ kind: AST.Atom.AssertionKind ) throws { - // FIXME: Depends on API model we have... We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. 
- switch kind { - case .startOfSubject: - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.lowerBound - } - - case .endOfSubjectBeforeNewline: - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input.index(after: pos) == subjectBounds.upperBound - && input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound - && input.unicodeScalars[pos].isNewline - } - } - - case .endOfSubject: - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.upperBound - } - - case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out - throw Unsupported(#"\K (reset/keep assertion)"#) - - case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - builder.buildAssert { (_, _, input, pos, subjectBounds) in false } - - case .textSegment: - builder.buildAssert { (_, _, input, pos, _) in - // FIXME: Grapheme or word based on options - input.isOnGraphemeClusterBoundary(pos) - } - - case .notTextSegment: - builder.buildAssert { (_, _, input, pos, _) in - // FIXME: Grapheme or word based on options - !input.isOnGraphemeClusterBoundary(pos) - } - - case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
- if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } - } else { - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.lowerBound - } - } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. - if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } - } else { - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.upperBound - } - } - - case .wordBoundary: - builder.buildAssert { [options] - (cache, maxIndex, input, pos, subjectBounds) in - if options.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return _CharacterClassModel.word.isBoundary( - input, - at: pos, - bounds: subjectBounds, - with: options - ) - } else { - return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) - } - } - - case .notWordBoundary: - builder.buildAssert { [options] - (cache, maxIndex, input, pos, subjectBounds) in - if options.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? 
- return !_CharacterClassModel.word.isBoundary( - input, - at: pos, - bounds: subjectBounds, - with: options - ) - } else { - return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) - } - } - } + builder.buildAssert( + by: kind, + options.anchorsMatchNewlines, + options.usesSimpleUnicodeBoundaries, + options.usesASCIIWord, + options.semanticLevel) } mutating func emitScalar(_ s: UnicodeScalar) throws { diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 0c10d6373..137f5147a 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -8,7 +8,7 @@ // See https://swift.org/LICENSE.txt for license information // //===----------------------------------------------------------------------===// - +@_implementationOnly import _RegexParser // For AssertionKind extension Instruction { /// An instruction's payload packs operands and destination @@ -51,7 +51,6 @@ extension Instruction.Payload { case element(ElementRegister) case consumer(ConsumeFunctionRegister) case bitset(AsciiBitsetRegister) - case assertion(AssertionFunctionRegister) case addr(InstructionAddress) case capture(CaptureRegister) @@ -227,11 +226,64 @@ extension Instruction.Payload { interpret() } - init(assertion: AssertionFunctionRegister) { - self.init(assertion) - } - var assertion: AssertionFunctionRegister { - interpret() + var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } + init(assertion: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) { + // 4 bits of options + let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 + let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 + let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? 
(1 << 52) : 0 + let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit + + // 4 bits for the assertion kind + // Future work: Optimize this layout + let kind: UInt64 + switch assertion { + case .endOfLine: kind = 0 + case .endOfSubject: kind = 1 + case .endOfSubjectBeforeNewline: kind = 2 + case .firstMatchingPositionInSubject: kind = 3 + case .notTextSegment: kind = 4 + case .notWordBoundary: kind = 5 + case .resetStartOfMatch: kind = 6 + case .startOfLine: kind = 7 + case .startOfSubject: kind = 8 + case .textSegment: kind = 9 + case .wordBoundary: kind = 10 + } + self.init(rawValue: kind + optionsBits) + } + var assertion: (AST.Atom.AssertionKind, Bool, Bool, Bool, MatchingOptions.SemanticLevel) { + let anchorsMatchNewlines = (self.rawValue >> 55) & 1 == 1 + let usesSimpleUnicodeBoundaries = (self.rawValue >> 54) & 1 == 1 + let usesASCIIWord = (self.rawValue >> 53) & 1 == 1 + let semanticLevel: MatchingOptions.SemanticLevel + if (self.rawValue >> 52) & 1 == 1 { + semanticLevel = .unicodeScalar + } else { + semanticLevel = .graphemeCluster + } + let kind: AST.Atom.AssertionKind + switch self.rawValue & _assertionKindMask { + case 0: kind = .endOfLine + case 1: kind = .endOfSubject + case 2: kind = .endOfSubjectBeforeNewline + case 3: kind = .firstMatchingPositionInSubject + case 4: kind = .notTextSegment + case 5: kind = .notWordBoundary + case 6: kind = .resetStartOfMatch + case 7: kind = .startOfLine + case 8: kind = .startOfSubject + case 9: kind = .textSegment + case 10: kind = .wordBoundary + default: fatalError("Unreachable") + } + return (kind, anchorsMatchNewlines, usesSimpleUnicodeBoundaries, usesASCIIWord, semanticLevel) } init(addr: InstructionAddress) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 3a7784f57..a4e02bb9f 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -20,7 +20,6 @@ 
extension MEProgram { var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = [] var consumeFunctions: [ConsumeFunction] = [] - var assertionFunctions: [AssertionFunction] = [] var transformFunctions: [TransformFunction] = [] var matcherFunctions: [MatcherFunction] = [] @@ -171,11 +170,27 @@ extension MEProgram.Builder { .consumeBy, .init(consumer: makeConsumeFunction(p)))) } +// mutating func buildAssert( +// by p: @escaping MEProgram.AssertionFunction +// ) { +// instructions.append(.init( +// .assertBy, .init(assertion: makeAssertionFunction(p)))) +// } mutating func buildAssert( - by p: @escaping MEProgram.AssertionFunction + by kind: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel ) { instructions.append(.init( - .assertBy, .init(assertion: makeAssertionFunction(p)))) + .assertBy, + .init( + assertion: kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel))) } mutating func buildAccept() { @@ -292,7 +307,6 @@ extension MEProgram.Builder { regInfo.values = nextValueRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count - regInfo.assertionFunctions = assertionFunctions.count regInfo.transformFunctions = transformFunctions.count regInfo.matcherFunctions = matcherFunctions.count regInfo.captures = nextCaptureRegister.rawValue @@ -303,7 +317,6 @@ extension MEProgram.Builder { staticSequences: sequences.stored, staticBitsets: asciiBitsets, staticConsumeFunctions: consumeFunctions, - staticAssertionFunctions: assertionFunctions, staticTransformFunctions: transformFunctions, staticMatcherFunctions: matcherFunctions, registerInfo: regInfo, @@ -446,12 +459,6 @@ extension MEProgram.Builder { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - mutating func makeAssertionFunction( - _ f: @escaping 
MEProgram.AssertionFunction - ) -> AssertionFunctionRegister { - defer { assertionFunctions.append(f) } - return AssertionFunctionRegister(assertionFunctions.count) - } mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d311b4465..bacefb209 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -15,14 +15,6 @@ struct MEProgram { typealias Input = String typealias ConsumeFunction = (Input, Range) -> Input.Index? - typealias AssertionFunction = - ( - inout Set?, - inout String.Index?, - Input, - Input.Index, - Range - ) throws -> Bool typealias TransformFunction = (Input, Processor._StoredCapture) throws -> Any? typealias MatcherFunction = @@ -34,7 +26,6 @@ struct MEProgram { var staticSequences: [[Input.Element]] var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] var staticConsumeFunctions: [ConsumeFunction] - var staticAssertionFunctions: [AssertionFunction] var staticTransformFunctions: [TransformFunction] var staticMatcherFunctions: [MatcherFunction] diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 63b1a37f1..5e9d814d5 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,6 +9,8 @@ // //===----------------------------------------------------------------------===// +@_implementationOnly import _RegexParser // For AssertionKind + enum MatchMode { case wholeString case partialFromFront @@ -257,7 +259,6 @@ extension Processor { return false } - // Slow path: Do full match var matched: Bool var next = input.index(after: currentPosition) switch cc { @@ -300,7 +301,7 @@ extension Processor { signalFailure() return false } - // Slow path: Do full match + var matched: Bool var next = 
input.unicodeScalars.index(after: currentPosition) switch cc { @@ -339,6 +340,107 @@ extension Processor { } } + mutating func regexAssert( + by kind: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) throws -> Bool { + // Future work: Optimize layout and dispatch + + // FIXME: Depends on API model we have... We may want to + // think through some of these with API interactions in mind + // + // This might break how we use `bounds` for both slicing + // and things like `firstIndex`, that is `firstIndex` may + // need to supply both a slice bounds and a per-search bounds. + switch kind { + case .startOfSubject: return currentPosition == subjectBounds.lowerBound + + case .endOfSubjectBeforeNewline: + if currentPosition == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input.index(after: currentPosition) == subjectBounds.upperBound + && input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound + && input.unicodeScalars[currentPosition].isNewline + } + + case .endOfSubject: return currentPosition == subjectBounds.upperBound + + case .resetStartOfMatch: + // FIXME: Figure out how to communicate this out + throw Unsupported(#"\K (reset/keep assertion)"#) + + case .firstMatchingPositionInSubject: + // TODO: We can probably build a nice model with API here + + // FIXME: This needs to be based on `searchBounds`, + // not the `subjectBounds` given as an argument here + // (Note: the above fixme was in reference to the old assert function API. 
+ // Now that we're in processor, we have access to searchBounds) + return false + + case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) + + case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) + + case .startOfLine: + // FIXME: Anchor.startOfLine must always use this first branch + // The behavior of `^` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.startOfLine` anchor should always match the start + // of a line. Right now we don't distinguish between those anchors. + if anchorsMatchNewlines { + if currentPosition == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } + } else { + return currentPosition == subjectBounds.lowerBound + } + + case .endOfLine: + // FIXME: Anchor.endOfLine must always use this first branch + // The behavior of `$` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.endOfLine` anchor should always match the end + // of a line. Right now we don't distinguish between those anchors. + if anchorsMatchNewlines { + if currentPosition == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars[currentPosition].isNewline + } + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? 
+ return atSimpleBoundary(usesASCIIWord, semanticLevel) + // lily note: there appear to be no test cases that use this option, ping alex to ask what they should look like + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + + case .notWordBoundary: + if usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return !atSimpleBoundary(usesASCIIWord, semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + } + } + mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters) = savePoints.popLast()?.destructure @@ -508,15 +610,18 @@ extension Processor { controller.step() case .assertBy: - let reg = payload.assertion - let assertion = registers[reg] + let (kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel) = payload.assertion do { - guard try assertion( - &wordIndexCache, - &wordIndexMaxIndex, - input, - currentPosition, - subjectBounds + guard try regexAssert( + by: kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel ) else { signalFailure() return diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index c76413383..edc325a30 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -33,8 +33,6 @@ extension Processor { var consumeFunctions: [MEProgram.ConsumeFunction] - var assertionFunctions: [MEProgram.AssertionFunction] - // Captured-value constructors var transformFunctions: [MEProgram.TransformFunction] @@ -77,9 +75,6 @@ extension Processor.Registers { subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { consumeFunctions[i.rawValue] } - subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { - assertionFunctions[i.rawValue] - } subscript(_ i: 
TransformRegister) -> MEProgram.TransformFunction { transformFunctions[i.rawValue] } @@ -107,9 +102,6 @@ extension Processor.Registers { self.consumeFunctions = program.staticConsumeFunctions assert(consumeFunctions.count == info.consumeFunctions) - self.assertionFunctions = program.staticAssertionFunctions - assert(assertionFunctions.count == info.assertionFunctions) - self.transformFunctions = program.staticTransformFunctions assert(transformFunctions.count == info.transformFunctions) @@ -145,7 +137,6 @@ extension MEProgram { var strings = 0 var bitsets = 0 var consumeFunctions = 0 - var assertionFunctions = 0 var transformFunctions = 0 var matcherFunctions = 0 var ints = 0 diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 94c311e82..50da079f6 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,39 @@ @_spi(_Unicode) import Swift +extension Processor { + func atSimpleBoundary( + _ usesAsciiWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) -> Bool { + func matchesWord(at i: Input.Index) -> Bool { + switch semanticLevel { + case .graphemeCluster: + let c = input[i] + return c.isWordCharacter && (c.isASCII || !usesAsciiWord) + case .unicodeScalar: + let c = input.unicodeScalars[i] + return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) + } + } + + // FIXME: How should we handle bounds? 
+ // We probably need two concepts + if subjectBounds.isEmpty { return false } + if currentPosition == subjectBounds.lowerBound { + return matchesWord(at: currentPosition) + } + let priorIdx = input.index(before: currentPosition) + if currentPosition == subjectBounds.upperBound { + return matchesWord(at: priorIdx) + } + + let prior = matchesWord(at: priorIdx) + let current = matchesWord(at: currentPosition) + return prior != current + } +} + extension String { func isOnWordBoundary( at i: String.Index, diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index adc9edf78..e03f2572f 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -142,10 +142,6 @@ enum _AsciiBitsetRegister {} typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> enum _ConsumeFunctionRegister {} -/// Used for assertion functions, e.g. anchors etc -typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -enum _AssertionFunctionRegister {} - /// Used for capture transforms, etc typealias TransformRegister = TypedInt<_TransformRegister> enum _TransformRegister {} From 3e38ac6026b956396a8eac55b2dfd79892968cf7 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 11 Jul 2022 18:13:20 -0700 Subject: [PATCH 07/22] Cleanup --- Sources/_StringProcessing/ByteCodeGen.swift | 14 ++----- Sources/_StringProcessing/Compiler.swift | 1 + .../Engine/Instruction.swift | 4 -- .../_StringProcessing/Engine/MEBuilder.swift | 6 --- .../_StringProcessing/Engine/MEBuiltins.swift | 9 ----- .../_StringProcessing/Engine/Processor.swift | 8 ---- .../_CharacterClassModel.swift | 37 ++----------------- 7 files changed, 8 insertions(+), 71 deletions(-) delete mode 100644 Sources/_StringProcessing/Engine/MEBuiltins.swift diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index a4e32f729..8e7aad6cd 100644 --- 
a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -82,7 +82,10 @@ fileprivate extension Compiler.ByteCodeGen { case let .unconverted(astAtom): if optimizationsEnabled, let cc = astAtom.ast.characterClass?.builtinCC { - emitBuiltinCharacterClass(cc) + builder.buildMatchBuiltin( + cc, + cc.isStrict(options: options), + isScalar: options.semanticLevel == .unicodeScalar) return } if let consumer = try astAtom.ast.generateConsumer(options) { @@ -114,15 +117,6 @@ fileprivate extension Compiler.ByteCodeGen { throw Unsupported("Backreference kind: \(ref)") } } - - mutating func emitBuiltinCharacterClass( - _ cc: BuiltinCC - ) { - builder.buildMatchBuiltin( - cc, - cc.isStrict(options: options), - isScalar: options.semanticLevel == .unicodeScalar) - } mutating func emitAssertion( _ kind: AST.Atom.AssertionKind diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 97466dd66..530126a32 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -68,6 +68,7 @@ func _compileRegex( ) throws -> Executor { let ast = try parse(regex, syntax) let dsl: DSLTree + switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index a2ffa2d8a..6bccb294b 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -106,10 +106,6 @@ extension Instruction { /// Operand: Ascii bitset register containing the bitset case matchBitset - /// TODO: builtin assertions and anchors - case builtinAssertion - - /// TODO: builtin character classes case matchBuiltin // MARK: Extension points diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift 
b/Sources/_StringProcessing/Engine/MEBuilder.swift index 3378369c4..fd8b97beb 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -179,12 +179,6 @@ extension MEProgram.Builder { .consumeBy, .init(consumer: makeConsumeFunction(p)))) } -// mutating func buildAssert( -// by p: @escaping MEProgram.AssertionFunction -// ) { -// instructions.append(.init( -// .assertBy, .init(assertion: makeAssertionFunction(p)))) -// } mutating func buildAssert( by kind: AST.Atom.AssertionKind, _ anchorsMatchNewlines: Bool, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift deleted file mode 100644 index 720a41618..000000000 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ /dev/null @@ -1,9 +0,0 @@ - - -extension Processor { - - - mutating func builtinAssertion() { - fatalError("TODO: assertions and anchors") - } -} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index c4fc139ae..5c557dc81 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -262,8 +262,6 @@ extension Processor { var matched: Bool var next = input.index(after: currentPosition) switch cc { - // lily note: when do these `any` cases appear? can they be compiled - // into consume instructions at compile time? case .any, .anyGrapheme: matched = true case .anyScalar: matched = true @@ -320,8 +318,6 @@ extension Processor { matched = c.isNewline && (c.isASCII || !isStrictAscii) case .newlineSequence: matched = c.isNewline && (c.isASCII || !isStrictAscii) - // lily note: what exactly is this doing? matching a full cr-lf character - // even though its in scalar mode? why? 
if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { input.unicodeScalars.formIndex(after: &next) } @@ -426,7 +422,6 @@ extension Processor { if usesSimpleUnicodeBoundaries { // TODO: How should we handle bounds? return atSimpleBoundary(usesASCIIWord, semanticLevel) - // lily note: there appear to be no test cases that use this option, ping alex to ask what they should look like } else { return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) } @@ -716,9 +711,6 @@ extension Processor { storedCaptures[capNum].registerValue( value, overwriteInitial: sp) controller.step() - - case .builtinAssertion: - builtinAssertion() } } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 3d23e5399..d91dd4f65 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -564,35 +564,6 @@ extension DSLTree.CustomCharacterClass { } } -extension _CharacterClassModel { - // FIXME: Calling on inverted sets wont be the same as the - // inverse of a boundary if at the start or end of the - // string. (Think through what we want: do it ourselves or - // give the caller both options). - func isBoundary( - _ input: String, - at pos: String.Index, - bounds: Range, - with options: MatchingOptions - ) -> Bool { - // FIXME: How should we handle bounds? 
- // We probably need two concepts - if bounds.isEmpty { return false } - if pos == bounds.lowerBound { - return self.matches(in: input, at: pos, with: options) != nil - } - let priorIdx = input.index(before: pos) - if pos == bounds.upperBound { - return self.matches(in: input, at: priorIdx, with: options) != nil - } - - let prior = self.matches(in: input, at: priorIdx, with: options) != nil - let current = self.matches(in: input, at: pos, with: options) != nil - return prior != current - } - -} - internal enum BuiltinCC: UInt64 { case any = 1 case anyGrapheme @@ -622,11 +593,9 @@ extension BuiltinCC { extension _CharacterClassModel { internal var builtinCC: BuiltinCC? { - if isInverted { return nil } // lily todo: add another flag to the payload? when is this set? why are there so many weird edge cases in ccm? it feels like it's trying to model both builtins and custom models - - // in that case, should we just convert a ccm to a ccc - // if it has these weird flags set? - // completely remove ccm from compilation and just emit either a builtincc or a ccc or an advance + // Future work: Make CCM always either a BuiltinCC or convertable to a + // custom character class + if isInverted { return nil } switch self.cc { case .any: return .any From e5d8b4a05dfb5e4b2017eb8a03f34ff47ddd53c4 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 12:19:45 -0700 Subject: [PATCH 08/22] Move match builtin and assert + Add AssertionPayload --- .../Engine/InstPayload.swift | 64 +---- .../_StringProcessing/Engine/MEBuilder.swift | 13 +- .../_StringProcessing/Engine/MEBuiltins.swift | 256 ++++++++++++++++++ .../_StringProcessing/Engine/Processor.swift | 201 +------------- 4 files changed, 270 insertions(+), 264 deletions(-) create mode 100644 Sources/_StringProcessing/Engine/MEBuiltins.swift diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 5a3d0a55d..31ff6c369 100644 --- 
a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -8,7 +8,6 @@ // See https://swift.org/LICENSE.txt for license information // //===----------------------------------------------------------------------===// -@_implementationOnly import _RegexParser // For AssertionKind extension Instruction { /// An instruction's payload packs operands and destination @@ -226,64 +225,11 @@ extension Instruction.Payload { interpret() } - var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } - init(assertion: AST.Atom.AssertionKind, - _ anchorsMatchNewlines: Bool, - _ usesSimpleUnicodeBoundaries: Bool, - _ usesASCIIWord: Bool, - _ semanticLevel: MatchingOptions.SemanticLevel - ) { - // 4 bits of options - let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 - let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 - let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 - let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? 
(1 << 52) : 0 - let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit - - // 4 bits for the assertion kind - // Future work: Optimize this layout - let kind: UInt64 - switch assertion { - case .endOfLine: kind = 0 - case .endOfSubject: kind = 1 - case .endOfSubjectBeforeNewline: kind = 2 - case .firstMatchingPositionInSubject: kind = 3 - case .notTextSegment: kind = 4 - case .notWordBoundary: kind = 5 - case .resetStartOfMatch: kind = 6 - case .startOfLine: kind = 7 - case .startOfSubject: kind = 8 - case .textSegment: kind = 9 - case .wordBoundary: kind = 10 - } - self.init(rawValue: kind + optionsBits) - } - var assertion: (AST.Atom.AssertionKind, Bool, Bool, Bool, MatchingOptions.SemanticLevel) { - let anchorsMatchNewlines = (self.rawValue >> 55) & 1 == 1 - let usesSimpleUnicodeBoundaries = (self.rawValue >> 54) & 1 == 1 - let usesASCIIWord = (self.rawValue >> 53) & 1 == 1 - let semanticLevel: MatchingOptions.SemanticLevel - if (self.rawValue >> 52) & 1 == 1 { - semanticLevel = .unicodeScalar - } else { - semanticLevel = .graphemeCluster - } - let kind: AST.Atom.AssertionKind - switch self.rawValue & _assertionKindMask { - case 0: kind = .endOfLine - case 1: kind = .endOfSubject - case 2: kind = .endOfSubjectBeforeNewline - case 3: kind = .firstMatchingPositionInSubject - case 4: kind = .notTextSegment - case 5: kind = .notWordBoundary - case 6: kind = .resetStartOfMatch - case 7: kind = .startOfLine - case 8: kind = .startOfSubject - case 9: kind = .textSegment - case 10: kind = .wordBoundary - default: fatalError("Unreachable") - } - return (kind, anchorsMatchNewlines, usesSimpleUnicodeBoundaries, usesASCIIWord, semanticLevel) + init(assertion payload: AssertionPayload) { + self.init(rawValue: payload.rawValue) + } + var assertion: AssertionPayload { + AssertionPayload.init(rawValue: self.rawValue & _payloadMask) } init(addr: InstructionAddress) { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift 
b/Sources/_StringProcessing/Engine/MEBuilder.swift index fd8b97beb..cc1beac60 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -186,14 +186,15 @@ extension MEProgram.Builder { _ usesASCIIWord: Bool, _ semanticLevel: MatchingOptions.SemanticLevel ) { + let payload = AssertionPayload.init( + kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel) instructions.append(.init( .assertBy, - .init( - assertion: kind, - anchorsMatchNewlines, - usesSimpleUnicodeBoundaries, - usesASCIIWord, - semanticLevel))) + .init(assertion: payload))) } mutating func buildAccept() { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift new file mode 100644 index 000000000..8d7989a50 --- /dev/null +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -0,0 +1,256 @@ +@_implementationOnly import _RegexParser // For AssertionKind + +extension Processor { + mutating func matchBuiltin( + _ cc: BuiltinCC, + _ isStrictAscii: Bool + ) -> Bool { + guard let c = load() else { + signalFailure() + return false + } + + var matched: Bool + var next = input.index(after: currentPosition) + switch cc { + case .any, .anyGrapheme: matched = true + case .anyScalar: + matched = true + next = input.unicodeScalars.index(after: currentPosition) + case .digit: + matched = c.isNumber && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = c.isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !isStrictAscii) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !isStrictAscii) + case .whitespace: + matched = c.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) + } + + if matched { + 
currentPosition = next + return true + } else { + signalFailure() + return false + } + } + + mutating func matchBuiltinScalar( + _ cc: BuiltinCC, + _ isStrictAscii: Bool + ) -> Bool { + guard let c = loadScalar() else { + signalFailure() + return false + } + + var matched: Bool + var next = input.unicodeScalars.index(after: currentPosition) + switch cc { + case .any: matched = true + case .anyScalar: matched = true + case .anyGrapheme: + matched = true + next = input.index(after: currentPosition) + case .digit: + matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) + case .hexDigit: + matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) + case .verticalWhitespace: + matched = c.isNewline && (c.isASCII || !isStrictAscii) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !isStrictAscii) + if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { + input.unicodeScalars.formIndex(after: &next) + } + case .whitespace: + matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) + case .word: + matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) + } + + if matched { + currentPosition = next + return true + } else { + signalFailure() + return false + } + } + + mutating func regexAssert(by payload: AssertionPayload) throws -> Bool { + // Future work: Optimize layout and dispatch + + // FIXME: Depends on API model we have... We may want to + // think through some of these with API interactions in mind + // + // This might break how we use `bounds` for both slicing + // and things like `firstIndex`, that is `firstIndex` may + // need to supply both a slice bounds and a per-search bounds. 
+ switch payload.kind { + case .startOfSubject: return currentPosition == subjectBounds.lowerBound + + case .endOfSubjectBeforeNewline: + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input.index(after: currentPosition) == subjectBounds.upperBound + && input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound + && input.unicodeScalars[currentPosition].isNewline + } + + case .endOfSubject: return currentPosition == subjectBounds.upperBound + + case .resetStartOfMatch: + // FIXME: Figure out how to communicate this out + throw Unsupported(#"\K (reset/keep assertion)"#) + + case .firstMatchingPositionInSubject: + // TODO: We can probably build a nice model with API here + + // FIXME: This needs to be based on `searchBounds`, + // not the `subjectBounds` given as an argument here + // (Note: the above fixme was in reference to the old assert function API. + // Now that we're in processor, we have access to searchBounds) + return false + + case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) + + case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) + + case .startOfLine: + // FIXME: Anchor.startOfLine must always use this first branch + // The behavior of `^` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.startOfLine` anchor should always match the start + // of a line. Right now we don't distinguish between those anchors. 
+ if payload.anchorsMatchNewlines { + if currentPosition == subjectBounds.lowerBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } + } else { + return currentPosition == subjectBounds.lowerBound + } + + case .endOfLine: + // FIXME: Anchor.endOfLine must always use this first branch + // The behavior of `$` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.endOfLine` anchor should always match the end + // of a line. Right now we don't distinguish between those anchors. + if payload.anchorsMatchNewlines { + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars[currentPosition].isNewline + } + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + + case .notWordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? 
+ return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + } + } +} + +struct AssertionPayload: RawRepresentable { + var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } + var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } + + let rawValue: UInt64 + + init(rawValue: UInt64) { + self.rawValue = rawValue + assert(rawValue & _opcodeMask == 0) + } + + init(_ assertion: AST.Atom.AssertionKind, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) { + // 4 bits of options + let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 + let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 + let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? (1 << 52) : 0 + let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit + + // 4 bits for the assertion kind + // Future work: Optimize this layout + let kind: UInt64 + switch assertion { + case .endOfLine: kind = 0 + case .endOfSubject: kind = 1 + case .endOfSubjectBeforeNewline: kind = 2 + case .firstMatchingPositionInSubject: kind = 3 + case .notTextSegment: kind = 4 + case .notWordBoundary: kind = 5 + case .resetStartOfMatch: kind = 6 + case .startOfLine: kind = 7 + case .startOfSubject: kind = 8 + case .textSegment: kind = 9 + case .wordBoundary: kind = 10 + } + self.init(rawValue: kind + optionsBits) + } + + var kind: AST.Atom.AssertionKind { + let kind: AST.Atom.AssertionKind + switch self.rawValue & _assertionKindMask { + case 0: kind = .endOfLine + case 1: kind = .endOfSubject + case 2: kind = .endOfSubjectBeforeNewline + case 3: kind = .firstMatchingPositionInSubject + case 4: kind = .notTextSegment + case 5: kind = .notWordBoundary + case 6: kind = .resetStartOfMatch + case 7: kind = 
.startOfLine + case 8: kind = .startOfSubject + case 9: kind = .textSegment + case 10: kind = .wordBoundary + default: fatalError("Unreachable") + } + return kind + } + var anchorsMatchNewlines: Bool { (self.rawValue >> 55) & 1 == 1 } + var usesSimpleUnicodeBoundaries: Bool { (self.rawValue >> 54) & 1 == 1 } + var usesASCIIWord: Bool { (self.rawValue >> 53) & 1 == 1 } + var semanticLevel: MatchingOptions.SemanticLevel { + if (self.rawValue >> 52) & 1 == 1 { + return .unicodeScalar + } else { + return .graphemeCluster + } + } +} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 5c557dc81..b5bf804de 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,7 +9,6 @@ // //===----------------------------------------------------------------------===// -@_implementationOnly import _RegexParser // For AssertionKind enum MatchMode { case wholeString @@ -249,192 +248,6 @@ extension Processor { _uncheckedForcedConsumeOne() return true } - - mutating func matchBuiltin( - _ cc: BuiltinCC, - _ isStrictAscii: Bool - ) -> Bool { - guard let c = load() else { - signalFailure() - return false - } - - var matched: Bool - var next = input.index(after: currentPosition) - switch cc { - case .any, .anyGrapheme: matched = true - case .anyScalar: - matched = true - next = input.unicodeScalars.index(after: currentPosition) - case .digit: - matched = c.isNumber && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !isStrictAscii) - case .horizontalWhitespace: - matched = c.unicodeScalars.first?.isHorizontalWhitespace == true - && (c.isASCII || !isStrictAscii) - case .newlineSequence, .verticalWhitespace: - matched = c.unicodeScalars.first?.isNewline == true - && (c.isASCII || !isStrictAscii) - case .whitespace: - matched = c.isWhitespace && (c.isASCII || !isStrictAscii) - case .word: - matched = c.isWordCharacter && 
(c.isASCII || !isStrictAscii) - } - - if matched { - currentPosition = next - return true - } else { - signalFailure() - return false - } - } - - mutating func matchBuiltinScalar( - _ cc: BuiltinCC, - _ isStrictAscii: Bool - ) -> Bool { - guard let c = loadScalar() else { - signalFailure() - return false - } - - var matched: Bool - var next = input.unicodeScalars.index(after: currentPosition) - switch cc { - case .any: matched = true - case .anyScalar: matched = true - case .anyGrapheme: - matched = true - next = input.index(after: currentPosition) - case .digit: - matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) - case .horizontalWhitespace: - matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) - case .verticalWhitespace: - matched = c.isNewline && (c.isASCII || !isStrictAscii) - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !isStrictAscii) - if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { - input.unicodeScalars.formIndex(after: &next) - } - case .whitespace: - matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) - case .word: - matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) - } - - if matched { - currentPosition = next - return true - } else { - signalFailure() - return false - } - } - - mutating func regexAssert( - by kind: AST.Atom.AssertionKind, - _ anchorsMatchNewlines: Bool, - _ usesSimpleUnicodeBoundaries: Bool, - _ usesASCIIWord: Bool, - _ semanticLevel: MatchingOptions.SemanticLevel - ) throws -> Bool { - // Future work: Optimize layout and dispatch - - // FIXME: Depends on API model we have... 
We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. - switch kind { - case .startOfSubject: return currentPosition == subjectBounds.lowerBound - - case .endOfSubjectBeforeNewline: - if currentPosition == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input.index(after: currentPosition) == subjectBounds.upperBound - && input[currentPosition].isNewline - case .unicodeScalar: - return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound - && input.unicodeScalars[currentPosition].isNewline - } - - case .endOfSubject: return currentPosition == subjectBounds.upperBound - - case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out - throw Unsupported(#"\K (reset/keep assertion)"#) - - case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - // (Note: the above fixme was in reference to the old assert function API. - // Now that we're in processor, we have access to searchBounds) - return false - - case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) - - case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) - - case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
- if anchorsMatchNewlines { - if currentPosition == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: currentPosition)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline - } - } else { - return currentPosition == subjectBounds.lowerBound - } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. - if anchorsMatchNewlines { - if currentPosition == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[currentPosition].isNewline - case .unicodeScalar: - return input.unicodeScalars[currentPosition].isNewline - } - } else { - return currentPosition == subjectBounds.upperBound - } - - case .wordBoundary: - if usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return atSimpleBoundary(usesASCIIWord, semanticLevel) - } else { - return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } - - case .notWordBoundary: - if usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? 
- return !atSimpleBoundary(usesASCIIWord, semanticLevel) - } else { - return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } - } - } mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = @@ -615,19 +428,9 @@ extension Processor { controller.step() case .assertBy: - let (kind, - anchorsMatchNewlines, - usesSimpleUnicodeBoundaries, - usesASCIIWord, - semanticLevel) = payload.assertion + let payload = payload.assertion do { - guard try regexAssert( - by: kind, - anchorsMatchNewlines, - usesSimpleUnicodeBoundaries, - usesASCIIWord, - semanticLevel - ) else { + guard try regexAssert(by: payload) else { signalFailure() return } From 0466c25423115eb1546b3dfde7b4d1567b17b9c3 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 12:29:52 -0700 Subject: [PATCH 09/22] Cleanup assertions --- Sources/_StringProcessing/ByteCodeGen.swift | 3 +++ .../_StringProcessing/Engine/MEBuiltins.swift | 20 +++---------------- .../_StringProcessing/Engine/Processor.swift | 2 +- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 8e7aad6cd..c40ca2066 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -121,6 +121,9 @@ fileprivate extension Compiler.ByteCodeGen { mutating func emitAssertion( _ kind: AST.Atom.AssertionKind ) throws { + if kind == .resetStartOfMatch { + throw Unsupported(#"\K (reset/keep assertion)"#) + } builder.buildAssert( by: kind, options.anchorsMatchNewlines, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 8d7989a50..f79e8f463 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -87,15 +87,8 @@ extension Processor { } } - mutating func regexAssert(by payload: 
AssertionPayload) throws -> Bool { + mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { // Future work: Optimize layout and dispatch - - // FIXME: Depends on API model we have... We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. switch payload.kind { case .startOfSubject: return currentPosition == subjectBounds.lowerBound @@ -113,17 +106,10 @@ extension Processor { case .endOfSubject: return currentPosition == subjectBounds.upperBound case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out - throw Unsupported(#"\K (reset/keep assertion)"#) + fatalError("Unreachable, we should have thrown an error during compilation") case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - // (Note: the above fixme was in reference to the old assert function API. 
- // Now that we're in processor, we have access to searchBounds) - return false + return currentPosition == searchBounds.lowerBound case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index b5bf804de..4841d03e8 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -430,7 +430,7 @@ extension Processor { case .assertBy: let payload = payload.assertion do { - guard try regexAssert(by: payload) else { + guard try builtinAssert(by: payload) else { signalFailure() return } From f401e84e550d613eaaafa8c6c557d666fe0fd71a Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 17:35:20 -0700 Subject: [PATCH 10/22] Fix tests --- Tests/RegexTests/CompileTests.swift | 154 ++++++++++++++-------------- Tests/RegexTests/MatchTests.swift | 11 +- 2 files changed, 79 insertions(+), 86 deletions(-) diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..a7c0ee531 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -37,6 +37,7 @@ enum DecodedInstr { case matchScalarUnchecked case matchBitsetScalar case matchBitset + case matchBuiltin case consumeBy case assertBy case matchBy @@ -45,8 +46,6 @@ enum DecodedInstr { case endCapture case transformCapture case captureValue - case builtinAssertion - case builtinCharacterClass } extension DecodedInstr { @@ -55,87 +54,84 @@ extension DecodedInstr { /// /// Must stay in sync with Processor.cycle static func decode(_ instruction: Instruction) -> DecodedInstr { - let (opcode, payload) = instruction.destructure - - switch opcode { - case .invalid: - fatalError("Invalid program") - case .moveImmediate: - return .moveImmediate - case .moveCurrentPosition: - return .moveCurrentPosition - case .branch: - return .branch - case .condBranchZeroElseDecrement: 
- return .condBranchZeroElseDecrement - case .condBranchSamePosition: - return .condBranchSamePosition - case .save: - return .save - case .saveAddress: - return .saveAddress - case .splitSaving: - return .splitSaving - case .clear: - return .clear - case .clearThrough: - return .clearThrough - case .accept: - return .accept - case .fail: - return .fail - case .advance: - return .advance - case .match: - let (isCaseInsensitive, _) = payload.elementPayload - if isCaseInsensitive { - return .matchCaseInsensitive - } else { - return .match - } - case .matchScalar: - let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if boundaryCheck { - return .matchScalarCaseInsensitive - } else { - return .matchScalarCaseInsensitiveUnchecked - } + let (opcode, payload) = instruction.destructure + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: + return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return .condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive } else { - if boundaryCheck { - return .matchScalar - } else { - return .matchScalarUnchecked - } + return .matchScalarCaseInsensitiveUnchecked } - case .matchBitset: - let (isScalar, _) = payload.bitsetPayload - if 
isScalar { - return .matchBitsetScalar + } else { + if boundaryCheck { + return .matchScalar } else { - return .matchBitset + return .matchScalarUnchecked } - case .consumeBy: - return consumeBy - case .assertBy: - return .assertBy - case .matchBy: - return .matchBy - case .backreference: - return .backreference - case .beginCapture: - return .beginCapture - case .endCapture: - return .endCapture - case .transformCapture: - return .transformCapture - case .captureValue: - return .captureValue - case .builtinAssertion: - return .builtinAssertion - case .builtinCharacterClass: - return .builtinCharacterClass -} + } + case .matchBitset: + let (isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .matchBuiltin: + return .matchBuiltin + } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f2715eac1..377d5a7be 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1072,13 +1072,10 @@ extension RegexTests { ("123 456", "23")) #endif - // TODO: \G and \K - do { - let regex = try Regex(#"\Gab"#, as: Substring.self) - XCTExpectFailure { - XCTAssertEqual("abab".matches(of: regex).map(\.output), ["ab", "ab"]) - } - } + // \G and \K + let regex = try Regex(#"\Gab"#, as: Substring.self) + XCTAssertEqual("abab".matches(of: regex).map(\.output), ["ab", "ab"]) + // TODO: Oniguruma \y and \Y firstMatchTests( From b09f45fc8e624674307adcc04023ce16c2c26e4f Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Tue, 12 Jul 2022 17:37:55 -0700 Subject: [PATCH 11/22] Update opcode description for assertBy --- 
Sources/_StringProcessing/Engine/Instruction.swift | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index b0be7d4fd..ac63dc7f5 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -123,16 +123,12 @@ extension Instruction { /// Operand: Consume function register to call. case consumeBy - /// Custom lookaround assertion operation. - /// Triggers a failure if customFunction returns false. + /// Lookaround assertion operation. Performs a zero width assertion based on + /// the assertion type and options stored in the payload /// - /// assert(_ customFunction: ( - /// input: Input, - /// currentPos: Position, - /// bounds: Range - /// ) -> Bool) + /// assert(_:AssertionPayload) /// - /// Operands: destination bool register, assert hook register + /// Operands: AssertionPayload containing assertion type and options case assertBy /// Custom value-creating consume operation. 
From fb1576a7b99fd981d2af4d99c0362a7c94d30761 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Fri, 15 Jul 2022 14:18:14 -0700 Subject: [PATCH 12/22] Update branch to match main --- Sources/_StringProcessing/ByteCodeGen.swift | 26 ---- .../_StringProcessing/Engine/MEBuilder.swift | 2 +- .../_StringProcessing/Engine/MEBuiltins.swift | 129 +++++++----------- Sources/_StringProcessing/Regex/DSLTree.swift | 4 +- .../_CharacterClassModel.swift | 1 + 5 files changed, 57 insertions(+), 105 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5ef71824a..276c80fe2 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -156,32 +156,6 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitStartOfLine() { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } - } - - mutating func emitEndOfLine() { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } - } - mutating func emitAssertion( _ kind: DSLTree.Atom.Assertion ) throws { diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index b0735c160..44015e87e 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -188,7 +188,7 @@ extension MEProgram.Builder { } mutating func buildAssert( - by kind: 
AST.Atom.AssertionKind, + by kind: DSLTree.Atom.Assertion, _ anchorsMatchNewlines: Bool, _ usesSimpleUnicodeBoundaries: Bool, _ usesASCIIWord: Bool, diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index f79e8f463..af42fe9de 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -86,6 +86,26 @@ extension Processor { return false } } + + func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + if currentPosition == subjectBounds.lowerBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } + } + + func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars[currentPosition].isNewline + } + } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { // Future work: Optimize layout and dispatch @@ -116,54 +136,39 @@ extension Processor { case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
+ return isAtStartOfLine(payload) + case .endOfLine: + return isAtEndOfLine(payload) + + case .caretAnchor: if payload.anchorsMatchNewlines { - if currentPosition == subjectBounds.lowerBound { return true } - switch payload.semanticLevel { - case .graphemeCluster: - return input[input.index(before: currentPosition)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline - } + return isAtStartOfLine(payload) } else { return currentPosition == subjectBounds.lowerBound } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. - if payload.anchorsMatchNewlines { - if currentPosition == subjectBounds.upperBound { return true } - switch payload.semanticLevel { - case .graphemeCluster: - return input[currentPosition].isNewline - case .unicodeScalar: - return input.unicodeScalars[currentPosition].isNewline - } - } else { - return currentPosition == subjectBounds.upperBound - } - - case .wordBoundary: - if payload.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) - } else { - return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } - - case .notWordBoundary: - if payload.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? 
- return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) - } else { - return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) - } + + case .dollarAnchor: + if payload.anchorsMatchNewlines { + return isAtEndOfLine(payload) + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + + case .notWordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } } } } @@ -179,7 +184,7 @@ struct AssertionPayload: RawRepresentable { assert(rawValue & _opcodeMask == 0) } - init(_ assertion: AST.Atom.AssertionKind, + init(_ assertion: DSLTree.Atom.Assertion, _ anchorsMatchNewlines: Bool, _ usesSimpleUnicodeBoundaries: Bool, _ usesASCIIWord: Bool, @@ -194,40 +199,12 @@ struct AssertionPayload: RawRepresentable { // 4 bits for the assertion kind // Future work: Optimize this layout - let kind: UInt64 - switch assertion { - case .endOfLine: kind = 0 - case .endOfSubject: kind = 1 - case .endOfSubjectBeforeNewline: kind = 2 - case .firstMatchingPositionInSubject: kind = 3 - case .notTextSegment: kind = 4 - case .notWordBoundary: kind = 5 - case .resetStartOfMatch: kind = 6 - case .startOfLine: kind = 7 - case .startOfSubject: kind = 8 - case .textSegment: kind = 9 - case .wordBoundary: kind = 10 - } + let kind = assertion.rawValue self.init(rawValue: kind + optionsBits) } - var kind: AST.Atom.AssertionKind { - let kind: AST.Atom.AssertionKind - switch self.rawValue & _assertionKindMask { - case 0: kind = .endOfLine 
- case 1: kind = .endOfSubject - case 2: kind = .endOfSubjectBeforeNewline - case 3: kind = .firstMatchingPositionInSubject - case 4: kind = .notTextSegment - case 5: kind = .notWordBoundary - case 6: kind = .resetStartOfMatch - case 7: kind = .startOfLine - case 8: kind = .startOfSubject - case 9: kind = .textSegment - case 10: kind = .wordBoundary - default: fatalError("Unreachable") - } - return kind + var kind: DSLTree.Atom.Assertion { + return .init(rawValue: self.rawValue & _assertionKindMask)! } var anchorsMatchNewlines: Bool { (self.rawValue >> 55) & 1 == 1 } var usesSimpleUnicodeBoundaries: Bool { (self.rawValue >> 54) & 1 == 1 } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 4ea905fd5..a98bd8441 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -189,9 +189,9 @@ extension DSLTree { extension DSLTree.Atom { @_spi(RegexBuilder) - public enum Assertion: Hashable { + public enum Assertion: UInt64, Hashable { /// \A - case startOfSubject + case startOfSubject = 0 /// \Z case endOfSubjectBeforeNewline diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 003d5037a..f32e74693 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -316,6 +316,7 @@ extension BuiltinCC { case .hexDigit: return options.usesASCIIDigits case .horizontalWhitespace: return options.usesASCIISpaces case .newlineSequence: return options.usesASCIISpaces + case .verticalWhitespace: return options.usesASCIISpaces case .whitespace: return options.usesASCIISpaces case .word: return options.usesASCIIWord default: return false From 3b9485efd270038a4aee5d61cc8a2a851390b213 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Fri, 15 Jul 2022 17:35:34 -0700 Subject: [PATCH 13/22] Use the newly cleaned up _CharacterClassModel --- 
Sources/RegexBuilder/CharacterClass.swift | 25 +++--- Sources/_StringProcessing/ByteCodeGen.swift | 11 ++- .../_StringProcessing/ConsumerInterface.swift | 1 - .../Engine/InstPayload.swift | 53 +++++++++---- .../_StringProcessing/Engine/MEBuilder.swift | 4 +- .../_StringProcessing/Engine/MEBuiltins.swift | 18 +++-- .../_StringProcessing/Engine/Processor.swift | 8 +- .../Utility/RegexFactory.swift | 8 ++ .../_CharacterClassModel.swift | 79 +++++-------------- 9 files changed, 105 insertions(+), 102 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index ea52c28f3..289a8c66b 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -15,20 +15,27 @@ @available(SwiftStdlib 5.7, *) public struct CharacterClass { internal var ccc: DSLTree.CustomCharacterClass + internal var builtin: DSLTree._AST.Atom? // lily note: This seems illegal init(_ ccc: DSLTree.CustomCharacterClass) { self.ccc = ccc + self.builtin = nil } - init(unconverted atom: DSLTree._AST.Atom) { + init(builtin atom: DSLTree._AST.Atom) { self.ccc = .init(members: [.atom(.unconverted(atom))]) + self.builtin = atom } } @available(SwiftStdlib 5.7, *) extension CharacterClass: RegexComponent { public var regex: Regex { - _RegexFactory().customCharacterClass(ccc) + if let unconverted = builtin { + return _RegexFactory().unconverted(unconverted) + } else { + return _RegexFactory().customCharacterClass(ccc) + } } } @@ -50,15 +57,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: ._anyGrapheme) + .init(builtin: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: ._whitespace) + .init(builtin: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: ._digit) + .init(builtin: ._digit) } public static var hexDigit: CharacterClass { @@ -70,19 +77,19 @@ extension 
RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: ._horizontalWhitespace) + .init(builtin: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: ._newlineSequence) + .init(builtin: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: ._verticalWhitespace) + .init(builtin: ._verticalWhitespace) } public static var word: CharacterClass { - .init(unconverted: ._word) + .init(builtin: ._word) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 276c80fe2..88ec2de0a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -90,11 +90,10 @@ fileprivate extension Compiler.ByteCodeGen { options.apply(optionSequence.ast) case let .unconverted(astAtom): - if optimizationsEnabled, - let cc = astAtom.ast.characterClass?.builtinCC { + if let cc = astAtom.ast.characterClass { builder.buildMatchBuiltin( cc, - cc.isStrict(options: options), + cc.isStrictAscii(options: options), isScalar: options.semanticLevel == .unicodeScalar) return } @@ -666,10 +665,10 @@ fileprivate extension Compiler.ByteCodeGen { } else { builder.buildMatchAsciiBitset(asciiBitset) } - } else { - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + return } + let consumer = try ccc.generateConsumer(options) + builder.buildConsume(by: consumer) } @discardableResult diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 668d16eb6..dbbaf314b 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -269,7 +269,6 @@ extension AST.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - // TODO: Wean ourselves off of this type... 
if let cc = self.characterClass?.withMatchLevel( opts.matchLevel ) { diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 96c307c75..89b0d410b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -224,20 +224,11 @@ extension Instruction.Payload { return (isScalar: pair.0 == 1, pair.1) } - init(_ cc: BuiltinCC, _ isStrict: Bool, _ isScalar: Bool) { - let strictBit = isStrict ? 1 << 15 : 0 - let scalarBit = isScalar ? 1 << 14 : 0 - // val must be 16 bits, reserve the top 2 bits for if it is strict ascii or scalar - assert(cc.rawValue <= 0x3F_FF) - let val = cc.rawValue + UInt64(strictBit) + UInt64(scalarBit) - self.init(val) - } - var builtinCCPayload: (cc: BuiltinCC, isStrict: Bool, isScalar: Bool) { - let val = self.rawValue - let cc = BuiltinCC(rawValue: val & 0x3F_FF)! - let isStrict = (val >> 15) & 1 == 1 - let isScalar = (val >> 14) & 1 == 1 - return (cc, isStrict, isScalar) + init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) { + self.init(CharacterClassPayload(cc, isInverted, isStrict, isScalar).rawValue) + } + var characterClassPayload: CharacterClassPayload{ + return CharacterClassPayload(rawValue: rawValue & _payloadMask) } init(consumer: ConsumeFunctionRegister) { @@ -355,3 +346,37 @@ extension Instruction.Payload { } } +struct CharacterClassPayload: RawRepresentable { + let rawValue: UInt64 + // Layout: + // Top three bits are isInverted, isStrict, isScalar + // Lower 16 bits are _CCM.Representation + static let invertedShift: UInt64 = 55 + static let strictShift: UInt64 = 54 + static let scalarShift: UInt64 = 53 + static let ccMask: UInt64 = 0xFF + init(rawValue: UInt64) { + assert(rawValue & _opcodeMask == 0) + self.rawValue = rawValue + } + init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) { + let 
invertedBit = isInverted ? 1 << CharacterClassPayload.invertedShift : 0 + let strictBit = isStrict ? 1 << CharacterClassPayload.strictShift : 0 + let scalarBit = isScalar ? 1 << CharacterClassPayload.scalarShift : 0 + assert(cc.rawValue <= CharacterClassPayload.ccMask) // + self.init(rawValue: cc.rawValue + UInt64(invertedBit) + UInt64(strictBit) + UInt64(scalarBit)) + } + + var isInverted: Bool { + (self.rawValue >> CharacterClassPayload.invertedShift) & 1 == 1 + } + var isStrict: Bool { + (self.rawValue >> CharacterClassPayload.strictShift) & 1 == 1 + } + var isScalar: Bool { + (self.rawValue >> CharacterClassPayload.scalarShift) & 1 == 1 + } + var cc: _CharacterClassModel.Representation { + _CharacterClassModel.Representation.init(rawValue: self.rawValue & CharacterClassPayload.ccMask)! + } +} diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 44015e87e..421ed5da3 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -172,12 +172,12 @@ extension MEProgram.Builder { } mutating func buildMatchBuiltin( - _ cc: BuiltinCC, + _ cc: _CharacterClassModel, _ isStrict: Bool, isScalar: Bool ) { instructions.append(.init( - .matchBuiltin, .init(cc, isStrict, isScalar))) + .matchBuiltin, .init(cc.cc, cc.isInverted, isStrict, isScalar))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index af42fe9de..83c2e947c 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -2,12 +2,13 @@ extension Processor { mutating func matchBuiltin( - _ cc: BuiltinCC, + _ cc: _CharacterClassModel.Representation, + _ isInverted: Bool, _ isStrictAscii: Bool ) -> Bool { guard let c = load() else { signalFailure() - return false + return isInverted } var matched: Bool @@ -32,7 +33,9 @@ extension Processor { 
case .word: matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) } - + if isInverted { + matched.toggle() + } if matched { currentPosition = next return true @@ -43,12 +46,13 @@ extension Processor { } mutating func matchBuiltinScalar( - _ cc: BuiltinCC, + _ cc: _CharacterClassModel.Representation, + _ isInverted: Bool, _ isStrictAscii: Bool ) -> Bool { guard let c = loadScalar() else { signalFailure() - return false + return isInverted } var matched: Bool @@ -77,7 +81,9 @@ extension Processor { case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) } - + if isInverted { + matched.toggle() + } if matched { currentPosition = next return true diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 5f56ca881..0907164d8 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -478,13 +478,13 @@ extension Processor { } case .matchBuiltin: - let (cc, isStrict, isScalar) = payload.builtinCCPayload - if isScalar { - if matchBuiltinScalar(cc, isStrict) { + let payload = payload.characterClassPayload + if payload.isScalar { + if matchBuiltinScalar(payload.cc, payload.isInverted, payload.isStrict) { controller.step() } } else { - if matchBuiltin(cc, isStrict) { + if matchBuiltin(payload.cc, payload.isInverted, payload.isStrict) { controller.step() } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 31245c0f7..3c2e13a3e 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -58,6 +58,14 @@ public struct _RegexFactory { ) -> Regex { .init(node: .atom(.scalar(scalar))) } + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func unconverted( + _ atom: DSLTree._AST.Atom + ) -> Regex { + .init(node: .atom(.unconverted(atom))) + } @_spi(RegexBuilder) 
@available(SwiftStdlib 5.7, *) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index f32e74693..282ba1eb2 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -27,9 +27,9 @@ struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? - enum Representation: Hashable { + enum Representation: UInt64, Hashable { /// Any character - case any + case any = 0 /// Any grapheme cluster case anyGrapheme /// Any Unicode scalar @@ -70,6 +70,20 @@ struct _CharacterClassModel: Hashable { return result } + /// Returns true if this CharacterClass should be matched by strict ascii under the given options + func isStrictAscii(options: MatchingOptions) -> Bool { + switch self { + case .digit: return options.usesASCIIDigits + case .hexDigit: return options.usesASCIIDigits + case .horizontalWhitespace: return options.usesASCIISpaces + case .newlineSequence: return options.usesASCIISpaces + case .verticalWhitespace: return options.usesASCIISpaces + case .whitespace: return options.usesASCIISpaces + case .word: return options.usesASCIIWord + default: return false + } + } + /// Conditionally inverts a character class. /// /// - Parameter inversion: Indicates whether to invert the character class. @@ -95,6 +109,9 @@ struct _CharacterClassModel: Hashable { /// - Parameter options: Options for the match operation. /// - Returns: The index of the end of the match, or `nil` if there is no match. func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? { + // FIXME: This is only called in custom character classes that contain builtin + // character classes as members (ie: [a\w] or set operations), is there + // any way to avoid that? Can we remove this somehow? 
switch matchLevel { case .graphemeCluster: let c = str[i] @@ -295,61 +312,3 @@ extension AST.Atom.EscapedBuiltin { } } } - -internal enum BuiltinCC: UInt64 { - case any = 1 - case anyGrapheme - case anyScalar - case digit - case hexDigit - case horizontalWhitespace - case newlineSequence - case verticalWhitespace - case whitespace - case word -} - -extension BuiltinCC { - func isStrict(options: MatchingOptions) -> Bool { - switch self { - case .digit: return options.usesASCIIDigits - case .hexDigit: return options.usesASCIIDigits - case .horizontalWhitespace: return options.usesASCIISpaces - case .newlineSequence: return options.usesASCIISpaces - case .verticalWhitespace: return options.usesASCIISpaces - case .whitespace: return options.usesASCIISpaces - case .word: return options.usesASCIIWord - default: return false - } - } -} - -extension _CharacterClassModel { - internal var builtinCC: BuiltinCC? { - // Future work: Make CCM always either a BuiltinCC or convertable to a - // custom character class - if isInverted { return nil } - switch self.cc { - case .any: - return .any - case .anyGrapheme: - return .anyGrapheme - case .anyScalar: - return .anyScalar - case .digit: - return .digit - case .hexDigit: - return .hexDigit - case .horizontalWhitespace: - return .horizontalWhitespace - case .newlineSequence: - return .newlineSequence - case .verticalWhitespace: - return .verticalWhitespace - case .whitespace: - return .whitespace - case .word: - return .word - } - } -} From 64d1ed9d230975967762952ef9cb9dd636bd25da Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Fri, 15 Jul 2022 18:56:24 -0700 Subject: [PATCH 14/22] Add characterClass DSLTree node --- Sources/RegexBuilder/CharacterClass.swift | 37 +++++---- Sources/_StringProcessing/ByteCodeGen.swift | 19 +++-- .../_StringProcessing/ConsumerInterface.swift | 26 ++++--- .../_StringProcessing/Engine/MEBuiltins.swift | 4 - .../_StringProcessing/PrintAsPattern.swift | 39 ++++++++++ .../Regex/ASTConversion.swift | 37 
++++++++- Sources/_StringProcessing/Regex/DSLTree.swift | 69 +++++++++------- .../Utility/RegexFactory.swift | 6 +- .../_CharacterClassModel.swift | 78 ++++--------------- 9 files changed, 181 insertions(+), 134 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 289a8c66b..4cb0a5e42 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -15,24 +15,25 @@ @available(SwiftStdlib 5.7, *) public struct CharacterClass { internal var ccc: DSLTree.CustomCharacterClass - internal var builtin: DSLTree._AST.Atom? // lily note: This seems illegal + /// The builtin character class, if this CharacterClass is representable by one + internal var builtin: DSLTree.Atom.CharacterClass? init(_ ccc: DSLTree.CustomCharacterClass) { self.ccc = ccc self.builtin = nil } - init(builtin atom: DSLTree._AST.Atom) { - self.ccc = .init(members: [.atom(.unconverted(atom))]) - self.builtin = atom + init(builtin: DSLTree.Atom.CharacterClass) { + self.ccc = .init(members: [.atom(.characterClass(builtin))]) + self.builtin = builtin } } @available(SwiftStdlib 5.7, *) extension CharacterClass: RegexComponent { public var regex: Regex { - if let unconverted = builtin { - return _RegexFactory().unconverted(unconverted) + if let cc = builtin { + return _RegexFactory().characterClass(cc) } else { return _RegexFactory().customCharacterClass(ccc) } @@ -42,7 +43,15 @@ extension CharacterClass: RegexComponent { @available(SwiftStdlib 5.7, *) extension CharacterClass { public var inverted: CharacterClass { - CharacterClass(ccc.inverted) + return CharacterClass(ccc.inverted) + // lily fixme: this causes a precondition to fail in Capture.swift... why? + // why are the inverted builtins causing issues? 
+ // Match tests are all passing +// if let inv = builtin?.inverted { +// return CharacterClass(builtin: inv) +// } else { +// return CharacterClass(ccc.inverted) +// } } } @@ -57,15 +66,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(builtin: ._anyGrapheme) + .init(builtin: .anyGrapheme) } public static var whitespace: CharacterClass { - .init(builtin: ._whitespace) + .init(builtin: .whitespace) } public static var digit: CharacterClass { - .init(builtin: ._digit) + .init(builtin: .digit) } public static var hexDigit: CharacterClass { @@ -77,19 +86,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(builtin: ._horizontalWhitespace) + .init(builtin: .horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(builtin: ._newlineSequence) + .init(builtin: .newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(builtin: ._verticalWhitespace) + .init(builtin: .verticalWhitespace) } public static var word: CharacterClass { - .init(builtin: ._word) + .init(builtin: .word) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 88ec2de0a..d6277d5c9 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen { emitMatchScalar(s) } + case let .characterClass(cc): + emitCharacterClass(cc) + case let .assertion(kind): try emitAssertion(kind) @@ -90,13 +93,6 @@ fileprivate extension Compiler.ByteCodeGen { options.apply(optionSequence.ast) case let .unconverted(astAtom): - if let cc = astAtom.ast.characterClass { - builder.buildMatchBuiltin( - cc, - cc.isStrictAscii(options: options), - isScalar: options.semanticLevel == .unicodeScalar) - return - } if let consumer = try astAtom.ast.generateConsumer(options) { 
builder.buildConsume(by: consumer) } else { @@ -168,7 +164,14 @@ fileprivate extension Compiler.ByteCodeGen { options.usesASCIIWord, options.semanticLevel) } - + + mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { + builder.buildMatchBuiltin( + cc.model, + cc.model.isStrictAscii(options: options), + isScalar: options.semanticLevel == .unicodeScalar) + } + mutating func emitMatchScalar(_ s: UnicodeScalar) { assert(options.semanticLevel == .unicodeScalar) if options.isCaseInsensitive && s.properties.isCased { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbbaf314b..b37b9341a 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -162,6 +162,8 @@ extension DSLTree.Atom { case .assertion: // TODO: We could handle, should this be total? return nil + case .characterClass(let cc): + return cc.generateConsumer(opts) case .backreference: // TODO: Should we handle? @@ -182,6 +184,16 @@ extension DSLTree.Atom { } } +extension DSLTree.Atom.CharacterClass { + func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { + return { input, bounds in + // FIXME: should we worry about out of bounds? + model.withMatchLevel(opts.matchLevel) + .matches(in: input, at: bounds.lowerBound, with: opts) + } + } +} + extension String { /// Compares this string to `other` using the loose matching rule UAX44-LM2, /// which ignores case, whitespace, underscores, and nearly all medial @@ -269,15 +281,6 @@ extension AST.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - if let cc = self.characterClass?.withMatchLevel( - opts.matchLevel - ) { - return { input, bounds in - // FIXME: should we worry about out of bounds? 
- cc.matches(in: input, at: bounds.lowerBound, with: opts) - } - } - switch kind { case let .scalar(s): assertionFailure( @@ -311,8 +314,11 @@ extension AST.Atom { case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil + case .escaped: + // handled in emitAssertion and emitCharacterClass + return nil - case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, + case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 83c2e947c..4f14b0a06 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -20,8 +20,6 @@ extension Processor { next = input.unicodeScalars.index(after: currentPosition) case .digit: matched = c.isNumber && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !isStrictAscii) case .horizontalWhitespace: matched = c.unicodeScalars.first?.isHorizontalWhitespace == true && (c.isASCII || !isStrictAscii) @@ -65,8 +63,6 @@ extension Processor { next = input.index(after: currentPosition) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !isStrictAscii) case .horizontalWhitespace: matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) case .verticalWhitespace: diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 80f2e7697..321c27747 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -684,6 +684,41 @@ extension DSLTree.Atom.Assertion { } } +extension DSLTree.Atom.CharacterClass { + var _patternBase: String { + switch self { + case 
.anyGrapheme: + return ".anyGraphemeCluster" + case .anyUnicodeScalar: + return ".anyUnicodeScalar" + case .digit: + return ".digit" + case .notDigit: + return ".digit.inverted" + case .word: + return ".word" + case .notWord: + return ".word.inverted" + case .horizontalWhitespace: + return ".horizontalWhitespace" + case .notHorizontalWhitespace: + return ".horizontalWhitespace.inverted" + case .newlineSequence: + return ".newlineSequence" + case .notNewline: + return ".newlineSequence.inverted" + case .verticalWhitespace: + return ".verticalWhitespace" + case .notVerticalWhitespace: + return ".vertialWhitespace.inverted" + case .whitespace: + return ".whitespace" + case .notWhitespace: + return ".whitespace.inverted" + } + } +} + extension AST.Atom.CharacterProperty { var isUnprintableProperty: Bool { switch kind { @@ -1156,6 +1191,8 @@ extension DSLTree.Atom { case .assertion(let a): return (a._patternBase, false) + case .characterClass(let cc): + return (cc._patternBase, true) case .backreference(_): return ("/* TOOD: backreferences */", false) @@ -1200,6 +1237,8 @@ extension DSLTree.Atom { case .assertion: return "/* TODO: assertions */" + case .characterClass: + return "/* TODO: character classes */" case .backreference: return "/* TOOD: backreferences */" case .symbolicReference: diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c4ac8e759..1702b7761 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -223,6 +223,25 @@ extension AST.Atom.EscapedBuiltin { default: return nil } } + var dslCharacterClass: DSLTree.Atom.CharacterClass? 
{ + switch self { + case .decimalDigit: return .digit + case .notDecimalDigit: return .notDigit + case .horizontalWhitespace: return .horizontalWhitespace + case .notHorizontalWhitespace: return .notHorizontalWhitespace + case .newlineSequence: return .newlineSequence + case .notNewline: return .notNewline + case .whitespace: return .whitespace + case .notWhitespace: return .notWhitespace + case .verticalTab: return .verticalWhitespace + case .notVerticalTab: return .notVerticalWhitespace + case .wordCharacter: return .word + case .notWordCharacter: return .notWord + case .graphemeCluster: return .anyGrapheme + case .trueAnychar: return .anyUnicodeScalar + default: return nil + } + } } extension AST.Atom { @@ -234,6 +253,12 @@ extension AST.Atom { default: return nil } } + var dslCharacterClass: DSLTree.Atom.CharacterClass? { + switch kind { + case .escaped(let b): return b.dslCharacterClass + default: return nil + } + } } extension AST.Atom { @@ -241,6 +266,10 @@ extension AST.Atom { if let kind = dslAssertionKind { return .assertion(kind) } + + if let cc = dslCharacterClass { + return .characterClass(cc) + } switch self.kind { case let .char(c): return .char(c) @@ -249,9 +278,11 @@ extension AST.Atom { case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) - case .escaped(let c) where c.scalarValue != nil: - return .scalar(c.scalarValue!) 
- + case .escaped(let c): + guard let val = c.scalarValue else { + fatalError("Got a .escaped that was not an assertion, character class, or scalar value \(self)") + } + return .scalar(val) default: return .unconverted(.init(ast: self)) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index a98bd8441..b073511c1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -177,6 +177,7 @@ extension DSLTree { /// newlines unless single line mode is enabled. case dot + case characterClass(CharacterClass) case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -231,6 +232,46 @@ extension DSLTree.Atom { /// \B case notWordBoundary } + + @_spi(RegexBuilder) + public enum CharacterClass: Hashable { + case digit + case notDigit + case horizontalWhitespace + case notHorizontalWhitespace + case newlineSequence + case notNewline + case whitespace + case notWhitespace + case verticalWhitespace + case notVerticalWhitespace + case word + case notWord + case anyGrapheme + case anyUnicodeScalar + } +} + +extension DSLTree.Atom.CharacterClass { + @_spi(RegexBuilder) + public var inverted: DSLTree.Atom.CharacterClass? 
{ + switch self { + case .anyGrapheme: return nil + case .anyUnicodeScalar: return nil + case .digit: return .notDigit + case .notDigit: return .digit + case .word: return .notWord + case .notWord: return .word + case .horizontalWhitespace: return .notHorizontalWhitespace + case .notHorizontalWhitespace: return .horizontalWhitespace + case .newlineSequence: return .notNewline + case .notNewline: return .newlineSequence + case .verticalWhitespace: return .notVerticalWhitespace + case .notVerticalWhitespace: return .verticalWhitespace + case .whitespace: return .notWhitespace + case .notWhitespace: return .whitespace + } + } } extension Unicode.GeneralCategory { @@ -759,34 +800,8 @@ extension DSLTree { internal var ast: AST.MatchingOptionSequence } - @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom - - // FIXME: The below APIs should be removed once the DSL tree has been - // migrated to use proper DSL atoms for them. - - public static var _anyGrapheme: Self { - .init(ast: .init(.escaped(.graphemeCluster), .fake)) - } - public static var _whitespace: Self { - .init(ast: .init(.escaped(.whitespace), .fake)) - } - public static var _digit: Self { - .init(ast: .init(.escaped(.decimalDigit), .fake)) - } - public static var _horizontalWhitespace: Self { - .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) - } - public static var _newlineSequence: Self { - .init(ast: .init(.escaped(.newlineSequence), .fake)) - } - public static var _verticalWhitespace: Self { - .init(ast: .init(.escaped(.verticalTab), .fake)) - } - public static var _word: Self { - .init(ast: .init(.escaped(.wordCharacter), .fake)) - } } } } @@ -800,7 +815,7 @@ extension DSLTree.Atom { case .changeMatchingOptions, .assertion: return false case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, - .symbolicReference, .unconverted: + .symbolicReference, .unconverted, .characterClass: return true } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift 
b/Sources/_StringProcessing/Utility/RegexFactory.swift index 3c2e13a3e..e0df906fa 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -61,10 +61,10 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) - public func unconverted( - _ atom: DSLTree._AST.Atom + public func characterClass( + _ cc: DSLTree.Atom.CharacterClass ) -> Regex { - .init(node: .atom(.unconverted(atom))) + .init(node: .atom(.characterClass(cc))) } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 282ba1eb2..2431b3f45 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -36,8 +36,6 @@ struct _CharacterClassModel: Hashable { case anyScalar /// Character.isDigit case digit - /// Character.isHexDigit - case hexDigit /// Horizontal whitespace: `[:blank:]`, i.e /// `[\p{gc=Space_Separator}\N{CHARACTER TABULATION}] case horizontalWhitespace @@ -74,7 +72,6 @@ struct _CharacterClassModel: Hashable { func isStrictAscii(options: MatchingOptions) -> Bool { switch self { case .digit: return options.usesASCIIDigits - case .hexDigit: return options.usesASCIIDigits case .horizontalWhitespace: return options.usesASCIISpaces case .newlineSequence: return options.usesASCIISpaces case .verticalWhitespace: return options.usesASCIISpaces @@ -84,22 +81,11 @@ struct _CharacterClassModel: Hashable { } } - /// Conditionally inverts a character class. - /// - /// - Parameter inversion: Indicates whether to invert the character class. - /// - Returns: The inverted character class if `inversion` is `true`; - /// otherwise, the same character class. - func withInversion(_ inversion: Bool) -> Self { - var copy = self - if inversion { - copy.isInverted.toggle() - } - return copy - } - /// Inverts a character class. 
var inverted: Self { - return withInversion(true) + var copy = self + copy.isInverted.toggle() + return copy } /// Returns the end of the match of this character class in the string. @@ -124,8 +110,6 @@ struct _CharacterClassModel: Hashable { next = str.unicodeScalars.index(after: i) case .digit: matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) case .horizontalWhitespace: matched = c.unicodeScalars.first?.isHorizontalWhitespace == true && (c.isASCII || !options.usesASCIISpaces) @@ -153,8 +137,6 @@ struct _CharacterClassModel: Hashable { nextIndex = str.index(after: i) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) case .horizontalWhitespace: matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) case .verticalWhitespace: @@ -197,10 +179,6 @@ extension _CharacterClassModel { static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - - static var hexDigit: _CharacterClassModel { - .init(cc: .hexDigit, matchLevel: .graphemeCluster) - } static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) @@ -226,7 +204,6 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .anyGrapheme: return "" case .anyScalar: return "" case .digit: return "" - case .hexDigit: return "" case .horizontalWhitespace: return "" case .newlineSequence: return "" case .verticalWhitespace: return "vertical whitespace" @@ -252,37 +229,11 @@ extension _CharacterClassModel { } } -extension AST.Atom { - var characterClass: _CharacterClassModel? { - switch kind { - case let .escaped(b): return b.characterClass - - case .property: - // TODO: Would our model type for character classes include - // this? 
Or does grapheme-semantic mode complicate that? - return nil - - case .dot: - // `.dot` is handled in the matching engine by Compiler.emitDot() and in - // the legacy compiler by the `.any` instruction, which can provide lower - // level instructions than the CharacterClass-generated consumer closure - // - // FIXME: We shouldn't be returning `nil` here, but instead fixing the call - // site to check for any before trying to construct a character class. - return nil - - default: return nil - - } - } - -} - -extension AST.Atom.EscapedBuiltin { - var characterClass: _CharacterClassModel? { +extension DSLTree.Atom.CharacterClass { + var model: _CharacterClassModel { switch self { - case .decimalDigit: return .digit - case .notDecimalDigit: return .digit.inverted + case .digit: return .digit + case .notDigit: return .digit.inverted case .horizontalWhitespace: return .horizontalWhitespace case .notHorizontalWhitespace: @@ -298,17 +249,14 @@ extension AST.Atom.EscapedBuiltin { case .whitespace: return .whitespace case .notWhitespace: return .whitespace.inverted - case .verticalTab: return .verticalWhitespace - case .notVerticalTab: return .verticalWhitespace.inverted - - case .wordCharacter: return .word - case .notWordCharacter: return .word.inverted + case .verticalWhitespace: return .verticalWhitespace + case .notVerticalWhitespace: return .verticalWhitespace.inverted - case .graphemeCluster: return .anyGrapheme - case .trueAnychar: return .anyUnicodeScalar + case .word: return .word + case .notWord: return .word.inverted - default: - return nil + case .anyGrapheme: return .anyGrapheme + case .anyUnicodeScalar: return .anyUnicodeScalar } } } From 2a6fe3c8ee084cb1a05a750b627e98e99c8bb299 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 18 Jul 2022 18:00:46 -0700 Subject: [PATCH 15/22] Bugfixes - matchBuiltin always fails if at endIndex - fix switch in isStrictAscii --- Sources/RegexBuilder/CharacterClass.swift | 14 +++++--------- 
Sources/_StringProcessing/Engine/MEBuiltins.swift | 4 ++-- .../_StringProcessing/_CharacterClassModel.swift | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4cb0a5e42..08c7d347e 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -43,15 +43,11 @@ extension CharacterClass: RegexComponent { @available(SwiftStdlib 5.7, *) extension CharacterClass { public var inverted: CharacterClass { - return CharacterClass(ccc.inverted) - // lily fixme: this causes a precondition to fail in Capture.swift... why? - // why are the inverted builtins causing issues? - // Match tests are all passing -// if let inv = builtin?.inverted { -// return CharacterClass(builtin: inv) -// } else { -// return CharacterClass(ccc.inverted) -// } + if let inv = builtin?.inverted { + return CharacterClass(builtin: inv) + } else { + return CharacterClass(ccc.inverted) + } } } diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 4f14b0a06..21edc2ce4 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -8,7 +8,7 @@ extension Processor { ) -> Bool { guard let c = load() else { signalFailure() - return isInverted + return false } var matched: Bool @@ -50,7 +50,7 @@ extension Processor { ) -> Bool { guard let c = loadScalar() else { signalFailure() - return isInverted + return false } var matched: Bool diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 2431b3f45..b3fef17fb 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -70,7 +70,7 @@ struct _CharacterClassModel: Hashable { /// Returns true if this CharacterClass should be matched by strict ascii under the 
given options func isStrictAscii(options: MatchingOptions) -> Bool { - switch self { + switch self.cc { case .digit: return options.usesASCIIDigits case .horizontalWhitespace: return options.usesASCIISpaces case .newlineSequence: return options.usesASCIISpaces From 206bfc6deab2a1d82aaa452c2a06f3f2fdbd289a Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Thu, 21 Jul 2022 13:35:35 -0700 Subject: [PATCH 16/22] Add documentation for matchBuiltin --- Sources/_StringProcessing/Engine/Instruction.swift | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index ac63dc7f5..f2ee88636 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -113,6 +113,14 @@ extension Instruction { /// - Boolean for if we should match by scalar value case matchBitset + /// Match against a built-in character class + /// + /// matchBuiltin(_: CharacterClassPayload) + /// + /// Operand: the payload contains + /// - The character class + /// - If it is inverted + /// - If it strictly matches only ascii values case matchBuiltin // MARK: Extension points From b53f52481727baf8a92026828d3bb9fe9ca25835 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 25 Jul 2022 12:31:38 -0700 Subject: [PATCH 17/22] Lots of cleanup - static vars in payloads - Clean up _CharacterClassModel - Use the model for bytecodegen and consumer interface - Merge the grapheme and scalar match builtin cases together --- Sources/_StringProcessing/ByteCodeGen.swift | 5 +- .../_StringProcessing/ConsumerInterface.swift | 5 +- .../Engine/InstPayload.swift | 42 +-- .../_StringProcessing/Engine/MEBuilder.swift | 8 +- .../_StringProcessing/Engine/MEBuiltins.swift | 127 ++++---- .../_StringProcessing/Engine/Processor.swift | 15 +- .../_StringProcessing/MatchingOptions.swift | 12 - .../_CharacterClassModel.swift | 301 ++++++++---------- 8 files changed, 234 
insertions(+), 281 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d6277d5c9..0003cfa5d 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -166,10 +166,7 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { - builder.buildMatchBuiltin( - cc.model, - cc.model.isStrictAscii(options: options), - isScalar: options.semanticLevel == .unicodeScalar) + builder.buildMatchBuiltin(model: cc.asRuntimeModel(options)) } mutating func emitMatchScalar(_ s: UnicodeScalar) { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b37b9341a..370c74e91 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -186,10 +186,9 @@ extension DSLTree.Atom { extension DSLTree.Atom.CharacterClass { func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { + let model = asRuntimeModel(opts) return { input, bounds in - // FIXME: should we worry about out of bounds? 
- model.withMatchLevel(opts.matchLevel) - .matches(in: input, at: bounds.lowerBound, with: opts) + model.matches(in: input, at: bounds.lowerBound) } } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 89b0d410b..3ff93e064 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -224,8 +224,8 @@ extension Instruction.Payload { return (isScalar: pair.0 == 1, pair.1) } - init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) { - self.init(CharacterClassPayload(cc, isInverted, isStrict, isScalar).rawValue) + init(_ model: _CharacterClassModel) { + self.init(CharacterClassPayload(model).rawValue) } var characterClassPayload: CharacterClassPayload{ return CharacterClassPayload(rawValue: rawValue & _payloadMask) @@ -350,33 +350,37 @@ struct CharacterClassPayload: RawRepresentable { let rawValue: UInt64 // Layout: // Top three bits are isInverted, isStrict, isScalar - // Lower 16 bits are _CCM.Representation - static let invertedShift: UInt64 = 55 - static let strictShift: UInt64 = 54 - static let scalarShift: UInt64 = 53 - static let ccMask: UInt64 = 0xFF + // Lower 8 bits are _CCM.Representation + static var invertedBit: UInt64 { 1 << 55 } + static var strictASCIIBit: UInt64 { 1 << 54 } + static var scalarBit: UInt64 { 1 << 53 } + static var ccMask: UInt64 { 0xFF } init(rawValue: UInt64) { assert(rawValue & _opcodeMask == 0) self.rawValue = rawValue } - init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) { - let invertedBit = isInverted ? 1 << CharacterClassPayload.invertedShift : 0 - let strictBit = isStrict ? 1 << CharacterClassPayload.strictShift : 0 - let scalarBit = isScalar ? 
1 << CharacterClassPayload.scalarShift : 0 - assert(cc.rawValue <= CharacterClassPayload.ccMask) // - self.init(rawValue: cc.rawValue + UInt64(invertedBit) + UInt64(strictBit) + UInt64(scalarBit)) + init(_ model: _CharacterClassModel) { + let invertedBit = model.isInverted ? CharacterClassPayload.invertedBit : 0 + let strictASCIIBit = model.isStrictAscii ? CharacterClassPayload.strictASCIIBit : 0 + let scalarBit = model.matchLevel == .unicodeScalar ? CharacterClassPayload.scalarBit : 0 + assert(model.cc.rawValue <= CharacterClassPayload.ccMask) + assert(model.cc.rawValue & invertedBit & strictASCIIBit & scalarBit == 0) // Sanity check + self.init(rawValue: model.cc.rawValue | invertedBit | strictASCIIBit | scalarBit) } var isInverted: Bool { - (self.rawValue >> CharacterClassPayload.invertedShift) & 1 == 1 + self.rawValue & CharacterClassPayload.invertedBit != 0 } - var isStrict: Bool { - (self.rawValue >> CharacterClassPayload.strictShift) & 1 == 1 + /// Represents if the given character class should strictly only match ascii values based on the options given + /// See Oniguruma options: (?D) (?\P) (?S) (?W) + var isStrictASCII: Bool { + self.rawValue & CharacterClassPayload.strictASCIIBit != 0 } - var isScalar: Bool { - (self.rawValue >> CharacterClassPayload.scalarShift) & 1 == 1 + var isScalarSemantics: Bool { + self.rawValue & CharacterClassPayload.scalarBit != 0 } var cc: _CharacterClassModel.Representation { - _CharacterClassModel.Representation.init(rawValue: self.rawValue & CharacterClassPayload.ccMask)! 
+ _CharacterClassModel.Representation.init( + rawValue: self.rawValue & CharacterClassPayload.ccMask).unsafelyUnwrapped } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 421ed5da3..3406e9fed 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -171,13 +171,9 @@ extension MEProgram.Builder { .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) } - mutating func buildMatchBuiltin( - _ cc: _CharacterClassModel, - _ isStrict: Bool, - isScalar: Bool - ) { + mutating func buildMatchBuiltin(model: _CharacterClassModel) { instructions.append(.init( - .matchBuiltin, .init(cc.cc, cc.isInverted, isStrict, isScalar))) + .matchBuiltin, .init(model))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 21edc2ce4..55c77d72f 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -4,78 +4,66 @@ extension Processor { mutating func matchBuiltin( _ cc: _CharacterClassModel.Representation, _ isInverted: Bool, - _ isStrictAscii: Bool + _ isStrictASCII: Bool, + _ isScalarSemantics: Bool ) -> Bool { - guard let c = load() else { + guard let char = load(), let scalar = loadScalar() else { signalFailure() return false } + var asciiCheck: Bool { + (char.isASCII && !isScalarSemantics) + || (scalar.isASCII && isScalarSemantics) + || !isStrictASCII + } var matched: Bool - var next = input.index(after: currentPosition) - switch cc { - case .any, .anyGrapheme: matched = true - case .anyScalar: - matched = true + var next: Input.Index + if isScalarSemantics { next = input.unicodeScalars.index(after: currentPosition) - case .digit: - matched = c.isNumber && (c.isASCII || !isStrictAscii) - case .horizontalWhitespace: - matched = c.unicodeScalars.first?.isHorizontalWhitespace == true - 
&& (c.isASCII || !isStrictAscii) - case .newlineSequence, .verticalWhitespace: - matched = c.unicodeScalars.first?.isNewline == true - && (c.isASCII || !isStrictAscii) - case .whitespace: - matched = c.isWhitespace && (c.isASCII || !isStrictAscii) - case .word: - matched = c.isWordCharacter && (c.isASCII || !isStrictAscii) - } - if isInverted { - matched.toggle() - } - if matched { - currentPosition = next - return true } else { - signalFailure() - return false - } - } - - mutating func matchBuiltinScalar( - _ cc: _CharacterClassModel.Representation, - _ isInverted: Bool, - _ isStrictAscii: Bool - ) -> Bool { - guard let c = loadScalar() else { - signalFailure() - return false + next = input.index(after: currentPosition) } - - var matched: Bool - var next = input.unicodeScalars.index(after: currentPosition) switch cc { - case .any: matched = true - case .anyScalar: matched = true + case .any: + matched = true case .anyGrapheme: matched = true next = input.index(after: currentPosition) + case .anyScalar: + // FIXME: This allows us to be not-scalar aligned when in grapheme mode + // Should this even be allowed? 
+ matched = true + next = input.unicodeScalars.index(after: currentPosition) case .digit: - matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii) + if isScalarSemantics { + matched = scalar.properties.numericType != nil + } else { + matched = char.isNumber && asciiCheck + } case .horizontalWhitespace: - matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii) + matched = scalar.isHorizontalWhitespace && asciiCheck case .verticalWhitespace: - matched = c.isNewline && (c.isASCII || !isStrictAscii) + matched = scalar.isNewline && asciiCheck case .newlineSequence: - matched = c.isNewline && (c.isASCII || !isStrictAscii) - if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" { + matched = scalar.isNewline && asciiCheck + if isScalarSemantics && matched && scalar == "\r" + && next != input.endIndex && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics input.unicodeScalars.formIndex(after: &next) } case .whitespace: - matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii) + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } case .word: - matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii) + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } } if isInverted { matched.toggle() @@ -176,9 +164,6 @@ extension Processor { } struct AssertionPayload: RawRepresentable { - var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 } - var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 } - let rawValue: UInt64 init(rawValue: UInt64) { @@ -186,6 +171,12 @@ struct AssertionPayload: RawRepresentable { assert(rawValue & _opcodeMask == 0) } + static var anchorBit: UInt64 { 1 << 55 } + static var boundaryBit: UInt64 { 1 << 54 } + static var strictASCIIWordBit: UInt64 { 1 << 53
} + static var isScalarBit: UInt64 { 1 << 52 } + static var assertionKindMask: UInt64 { 0xFF } + init(_ assertion: DSLTree.Atom.Assertion, _ anchorsMatchNewlines: Bool, _ usesSimpleUnicodeBoundaries: Bool, @@ -193,26 +184,30 @@ struct AssertionPayload: RawRepresentable { _ semanticLevel: MatchingOptions.SemanticLevel ) { // 4 bits of options - let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0 - let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0 - let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0 - let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? (1 << 52) : 0 - let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit + let anchorBit: UInt64 = anchorsMatchNewlines ? AssertionPayload.anchorBit : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? AssertionPayload.boundaryBit : 0 + let strictASCIIWordBit: UInt64 = usesASCIIWord ? AssertionPayload.strictASCIIWordBit : 0 + let isScalarBit: UInt64 = semanticLevel == .unicodeScalar ? AssertionPayload.isScalarBit : 0 - // 4 bits for the assertion kind + // 8 bits for the assertion kind // Future work: Optimize this layout let kind = assertion.rawValue - self.init(rawValue: kind + optionsBits) + assert(kind <= AssertionPayload.assertionKindMask) + assert(kind & anchorBit & boundaryBit & strictASCIIWordBit & isScalarBit == 0) + self.init(rawValue: kind | anchorBit | boundaryBit | strictASCIIWordBit | isScalarBit) } var kind: DSLTree.Atom.Assertion { - return .init(rawValue: self.rawValue & _assertionKindMask)! 
+ return .init( + rawValue: self.rawValue & AssertionPayload.assertionKindMask).unsafelyUnwrapped + } + var anchorsMatchNewlines: Bool { self.rawValue & AssertionPayload.anchorBit != 0 } + var usesSimpleUnicodeBoundaries: Bool { + self.rawValue & AssertionPayload.boundaryBit != 0 } - var anchorsMatchNewlines: Bool { (self.rawValue >> 55) & 1 == 1 } - var usesSimpleUnicodeBoundaries: Bool { (self.rawValue >> 54) & 1 == 1 } - var usesASCIIWord: Bool { (self.rawValue >> 53) & 1 == 1 } + var usesASCIIWord: Bool { self.rawValue & AssertionPayload.strictASCIIWordBit != 0 } var semanticLevel: MatchingOptions.SemanticLevel { - if (self.rawValue >> 52) & 1 == 1 { + if self.rawValue & AssertionPayload.isScalarBit != 0 { return .unicodeScalar } else { return .graphemeCluster diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 0907164d8..55ac49ed9 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -479,14 +479,13 @@ extension Processor { case .matchBuiltin: let payload = payload.characterClassPayload - if payload.isScalar { - if matchBuiltinScalar(payload.cc, payload.isInverted, payload.isStrict) { - controller.step() - } - } else { - if matchBuiltin(payload.cc, payload.isInverted, payload.isStrict) { - controller.step() - } + if matchBuiltin( + payload.cc, + payload.isInverted, + payload.isStrictASCII, + payload.isScalarSemantics + ) { + controller.step() } case .consumeBy: diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index e56b8def2..d511c9f7c 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -122,18 +122,6 @@ extension MatchingOptions { } } -// Deprecated CharacterClass.MatchLevel API -extension MatchingOptions { - var matchLevel: _CharacterClassModel.MatchLevel { - switch semanticLevel { - case 
.graphemeCluster: - return .graphemeCluster - case .unicodeScalar: - return .unicodeScalar - } - } -} - // MARK: - Implementation extension MatchingOptions { /// An option that changes the behavior of a regular expression. diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index b3fef17fb..cdedf1530 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -17,16 +17,28 @@ struct _CharacterClassModel: Hashable { /// The actual character class to match. - var cc: Representation + let cc: Representation /// The level (character or Unicode scalar) at which to match. - var matchLevel: MatchLevel + let matchLevel: MatchingOptions.SemanticLevel + + /// If this character class only matches ascii characters + let isStrictAscii: Bool /// Whether this character class matches against an inverse, /// e.g \D, \S, [^abc]. - var isInverted: Bool = false - - // TODO: Split out builtin character classes into their own type? + let isInverted: Bool + + init( + cc: Representation, + options: MatchingOptions, + isInverted: Bool + ) { + self.cc = cc + self.matchLevel = options.semanticLevel + self.isStrictAscii = cc.isStrictAscii(options: options) + self.isInverted = isInverted + } enum Representation: UInt64, Hashable { /// Any character case any = 0 @@ -48,45 +60,6 @@ struct _CharacterClassModel: Hashable { /// Character.isLetter or Character.isDigit or Character == "_" case word } - - enum MatchLevel: Hashable { - /// Match at the extended grapheme cluster level. - case graphemeCluster - /// Match at the Unicode scalar level.
- case unicodeScalar - } - - var scalarSemantic: Self { - var result = self - result.matchLevel = .unicodeScalar - return result - } - - var graphemeClusterSemantic: Self { - var result = self - result.matchLevel = .graphemeCluster - return result - } - - /// Returns true if this CharacterClass should be matched by strict ascii under the given options - func isStrictAscii(options: MatchingOptions) -> Bool { - switch self.cc { - case .digit: return options.usesASCIIDigits - case .horizontalWhitespace: return options.usesASCIISpaces - case .newlineSequence: return options.usesASCIISpaces - case .verticalWhitespace: return options.usesASCIISpaces - case .whitespace: return options.usesASCIISpaces - case .word: return options.usesASCIIWord - default: return false - } - } - - /// Inverts a character class. - var inverted: Self { - var copy = self - copy.isInverted.toggle() - return copy - } /// Returns the end of the match of this character class in the string. /// @@ -94,106 +67,95 @@ struct _CharacterClassModel: Hashable { /// - Parameter at: The index to start matching. /// - Parameter options: Options for the match operation. /// - Returns: The index of the end of the match, or `nil` if there is no match. - func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? { + func matches( + in input: String, + at currentPosition: String.Index + ) -> String.Index? { // FIXME: This is only called in custom character classes that contain builtin // character classes as members (ie: [a\w] or set operations), is there // any way to avoid that? Can we remove this somehow? 
- switch matchLevel { - case .graphemeCluster: - let c = str[i] - var matched: Bool - var next = str.index(after: i) - switch cc { - case .any, .anyGrapheme: matched = true - case .anyScalar: - matched = true - next = str.unicodeScalars.index(after: i) - case .digit: - matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: - matched = c.unicodeScalars.first?.isHorizontalWhitespace == true - && (c.isASCII || !options.usesASCIISpaces) - case .newlineSequence, .verticalWhitespace: - matched = c.unicodeScalars.first?.isNewline == true - && (c.isASCII || !options.usesASCIISpaces) - case .whitespace: - matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) - case .word: - matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) + guard currentPosition != input.endIndex else { + return nil + } + let char = input[currentPosition] + let scalar = input.unicodeScalars[currentPosition] + let isScalarSemantics = matchLevel == .unicodeScalar + var asciiCheck: Bool { + (char.isASCII && !isScalarSemantics) + || (scalar.isASCII && isScalarSemantics) + || !isStrictAscii + } + var matched: Bool + var next: String.Index + if isScalarSemantics { + next = input.unicodeScalars.index(after: currentPosition) + } else { + next = input.index(after: currentPosition) + } + switch cc { + case .any: + matched = true + case .anyGrapheme: + matched = true + next = input.index(after: currentPosition) + case .anyScalar: + // FIXME: This allows us to be not-scalar aligned when in grapheme mode + // Should this even be allowed? 
+ matched = true + next = input.unicodeScalars.index(after: currentPosition) + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil + } else { + matched = char.isNumber && asciiCheck } - if isInverted { - matched.toggle() + case .horizontalWhitespace: + matched = scalar.isHorizontalWhitespace && asciiCheck + case .verticalWhitespace: + matched = scalar.isNewline && asciiCheck + case .newlineSequence: + matched = scalar.isNewline && asciiCheck + if isScalarSemantics && matched && scalar == "\r" + && next != input.endIndex && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + input.unicodeScalars.formIndex(after: &next) } - return matched ? next : nil - case .unicodeScalar: - let c = str.unicodeScalars[i] - var nextIndex = str.unicodeScalars.index(after: i) - var matched: Bool - switch cc { - case .any: matched = true - case .anyScalar: matched = true - case .anyGrapheme: - matched = true - nextIndex = str.index(after: i) - case .digit: - matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: - matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) - case .verticalWhitespace: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { - str.unicodeScalars.formIndex(after: &nextIndex) - } - case .whitespace: - matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) - case .word: - matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck } - if isInverted { - matched.toggle() + case .word: + if isScalarSemantics { +
matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck } - return matched ? nextIndex : nil + } + if isInverted { + matched.toggle() + } + if matched { + return next + } else { + return nil } } } -extension _CharacterClassModel { - static var any: _CharacterClassModel { - .init(cc: .any, matchLevel: .graphemeCluster) - } - - static var anyGrapheme: _CharacterClassModel { - .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) - } - - static var anyUnicodeScalar: _CharacterClassModel { - .init(cc: .any, matchLevel: .unicodeScalar) - } - - static var whitespace: _CharacterClassModel { - .init(cc: .whitespace, matchLevel: .graphemeCluster) - } - - static var digit: _CharacterClassModel { - .init(cc: .digit, matchLevel: .graphemeCluster) - } - - static var horizontalWhitespace: _CharacterClassModel { - .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) - } - - static var newlineSequence: _CharacterClassModel { - .init(cc: .newlineSequence, matchLevel: .graphemeCluster) - } - - static var verticalWhitespace: _CharacterClassModel { - .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) - } - - static var word: _CharacterClassModel { - .init(cc: .word, matchLevel: .graphemeCluster) +extension _CharacterClassModel.Representation { + /// Returns true if this CharacterClass should be matched by strict ascii under the given options + func isStrictAscii(options: MatchingOptions) -> Bool { + switch self { + case .digit: return options.usesASCIIDigits + case .horizontalWhitespace: return options.usesASCIISpaces + case .newlineSequence: return options.usesASCIISpaces + case .verticalWhitespace: return options.usesASCIISpaces + case .whitespace: return options.usesASCIISpaces + case .word: return options.usesASCIIWord + default: return false + } } } @@ -219,44 +181,57 @@ extension _CharacterClassModel: CustomStringConvertible { } } -extension _CharacterClassModel { - func withMatchLevel( - _ level: 
_CharacterClassModel.MatchLevel - ) -> _CharacterClassModel { - var cc = self - cc.matchLevel = level - return cc - } -} - extension DSLTree.Atom.CharacterClass { - var model: _CharacterClassModel { + /// Converts this DSLTree CharacterClass into our runtime representation + func asRuntimeModel(_ options: MatchingOptions) -> _CharacterClassModel { + let cc: _CharacterClassModel.Representation + var inverted = false switch self { - case .digit: return .digit - case .notDigit: return .digit.inverted - - case .horizontalWhitespace: return .horizontalWhitespace + case .digit: + cc = .digit + case .notDigit: + cc = .digit + inverted = true + + case .horizontalWhitespace: + cc = .horizontalWhitespace case .notHorizontalWhitespace: - return .horizontalWhitespace.inverted + cc = .horizontalWhitespace + inverted = true - case .newlineSequence: return .newlineSequence + case .newlineSequence: + cc = .newlineSequence // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through // emitDot(). For now we treat it as semantically invalid. 
- case .notNewline: return .newlineSequence.inverted - - case .whitespace: return .whitespace - case .notWhitespace: return .whitespace.inverted - - case .verticalWhitespace: return .verticalWhitespace - case .notVerticalWhitespace: return .verticalWhitespace.inverted - - case .word: return .word - case .notWord: return .word.inverted - - case .anyGrapheme: return .anyGrapheme - case .anyUnicodeScalar: return .anyUnicodeScalar + case .notNewline: + cc = .newlineSequence + inverted = true + + case .whitespace: + cc = .whitespace + case .notWhitespace: + cc = .whitespace + inverted = true + + case .verticalWhitespace: + cc = .verticalWhitespace + case .notVerticalWhitespace: + cc = .verticalWhitespace + inverted = true + + case .word: + cc = .word + case .notWord: + cc = .word + inverted = true + + case .anyGrapheme: + cc = .anyGrapheme + case .anyUnicodeScalar: + cc = .anyScalar } + return _CharacterClassModel(cc: cc, options: options, isInverted: inverted) } } From bb5245fbc61f246ba02492f580bc02e8be4afed3 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 25 Jul 2022 12:35:19 -0700 Subject: [PATCH 18/22] Move assertion payload --- .../Engine/InstPayload.swift | 81 +++++++++++++++---- .../_StringProcessing/Engine/MEBuiltins.swift | 52 ------------ 2 files changed, 67 insertions(+), 66 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 3ff93e064..7c81b1326 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -224,13 +224,6 @@ extension Instruction.Payload { return (isScalar: pair.0 == 1, pair.1) } - init(_ model: _CharacterClassModel) { - self.init(CharacterClassPayload(model).rawValue) - } - var characterClassPayload: CharacterClassPayload{ - return CharacterClassPayload(rawValue: rawValue & _payloadMask) - } - init(consumer: ConsumeFunctionRegister) { self.init(consumer) } @@ -238,13 +231,6 @@ extension 
Instruction.Payload { interpret() } - init(assertion payload: AssertionPayload) { - self.init(rawValue: payload.rawValue) - } - var assertion: AssertionPayload { - AssertionPayload.init(rawValue: self.rawValue & _payloadMask) - } - init(addr: InstructionAddress) { self.init(addr) } @@ -344,8 +330,23 @@ extension Instruction.Payload { ) { interpretPair() } + // MARK: Struct payloads + init(_ model: _CharacterClassModel) { + self.init(CharacterClassPayload(model).rawValue) + } + var characterClassPayload: CharacterClassPayload{ + return CharacterClassPayload(rawValue: rawValue & _payloadMask) + } + + init(assertion payload: AssertionPayload) { + self.init(rawValue: payload.rawValue) + } + var assertion: AssertionPayload { + AssertionPayload.init(rawValue: self.rawValue & _payloadMask) + } } +// MARK: Struct definitions struct CharacterClassPayload: RawRepresentable { let rawValue: UInt64 // Layout: @@ -384,3 +385,55 @@ struct CharacterClassPayload: RawRepresentable { rawValue: self.rawValue & CharacterClassPayload.ccMask).unsafelyUnwrapped } } + +struct AssertionPayload: RawRepresentable { + let rawValue: UInt64 + + init(rawValue: UInt64) { + self.rawValue = rawValue + assert(rawValue & _opcodeMask == 0) + } + + static var anchorBit: UInt64 { 1 << 55 } + static var boundaryBit: UInt64 { 1 << 54 } + static var strictASCIIWordBit: UInt64 { 1 << 53 } + static var isScalarBit: UInt64 { 1 << 52 } + static var assertionKindMask: UInt64 { 0xFF } + + init(_ assertion: DSLTree.Atom.Assertion, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) { + // 4 bits of options + let anchorBit: UInt64 = anchorsMatchNewlines ? AssertionPayload.anchorBit : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? AssertionPayload.boundaryBit : 0 + let strictASCIIWordBit: UInt64 = usesASCIIWord ? 
AssertionPayload.strictASCIIWordBit : 0 + let isScalarBit: UInt64 = semanticLevel == .unicodeScalar ? AssertionPayload.isScalarBit : 0 + + // 8 bits for the assertion kind + // Future work: Optimize this layout + let kind = assertion.rawValue + assert(kind <= AssertionPayload.assertionKindMask) + assert(kind & anchorBit & boundaryBit & strictASCIIWordBit & isScalarBit == 0) + self.init(rawValue: kind | anchorBit | boundaryBit | strictASCIIWordBit | isScalarBit) + } + + var kind: DSLTree.Atom.Assertion { + return .init( + rawValue: self.rawValue & AssertionPayload.assertionKindMask).unsafelyUnwrapped + } + var anchorsMatchNewlines: Bool { self.rawValue & AssertionPayload.anchorBit != 0 } + var usesSimpleUnicodeBoundaries: Bool { + self.rawValue & AssertionPayload.boundaryBit != 0 + } + var usesASCIIWord: Bool { self.rawValue & AssertionPayload.strictASCIIWordBit != 0 } + var semanticLevel: MatchingOptions.SemanticLevel { + if self.rawValue & AssertionPayload.isScalarBit != 0 { + return .unicodeScalar + } else { + return .graphemeCluster + } + } +} diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 55c77d72f..18b397191 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -162,55 +162,3 @@ extension Processor { } } } - -struct AssertionPayload: RawRepresentable { - let rawValue: UInt64 - - init(rawValue: UInt64) { - self.rawValue = rawValue - assert(rawValue & _opcodeMask == 0) - } - - static var anchorBit: UInt64 { 1 << 55 } - static var boundaryBit: UInt64 { 1 << 54 } - static var strictASCIIWordBit: UInt64 { 1 << 53 } - static var isScalarBit: UInt64 { 1 << 52 } - static var assertionKindMask: UInt64 { 0xFF } - - init(_ assertion: DSLTree.Atom.Assertion, - _ anchorsMatchNewlines: Bool, - _ usesSimpleUnicodeBoundaries: Bool, - _ usesASCIIWord: Bool, - _ semanticLevel: MatchingOptions.SemanticLevel - ) { - // 4 bits of options - 
let anchorBit: UInt64 = anchorsMatchNewlines ? AssertionPayload.anchorBit : 0 - let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? AssertionPayload.boundaryBit : 0 - let strictASCIIWordBit: UInt64 = usesASCIIWord ? AssertionPayload.strictASCIIWordBit : 0 - let isScalarBit: UInt64 = semanticLevel == .unicodeScalar ? AssertionPayload.isScalarBit : 0 - - // 8 bits for the assertion kind - // Future work: Optimize this layout - let kind = assertion.rawValue - assert(kind <= AssertionPayload.assertionKindMask) - assert(kind & anchorBit & boundaryBit & strictASCIIWordBit & isScalarBit == 0) - self.init(rawValue: kind | anchorBit | boundaryBit | strictASCIIWordBit | isScalarBit) - } - - var kind: DSLTree.Atom.Assertion { - return .init( - rawValue: self.rawValue & AssertionPayload.assertionKindMask).unsafelyUnwrapped - } - var anchorsMatchNewlines: Bool { self.rawValue & AssertionPayload.anchorBit != 0 } - var usesSimpleUnicodeBoundaries: Bool { - self.rawValue & AssertionPayload.boundaryBit != 0 - } - var usesASCIIWord: Bool { self.rawValue & AssertionPayload.strictASCIIWordBit != 0 } - var semanticLevel: MatchingOptions.SemanticLevel { - if self.rawValue & AssertionPayload.isScalarBit != 0 { - return .unicodeScalar - } else { - return .graphemeCluster - } - } -} From 0746847aa622ed5480b05b836ddf0df066d0bb78 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 25 Jul 2022 14:14:29 -0700 Subject: [PATCH 19/22] More minor cleanup --- .../Engine/InstPayload.swift | 2 +- .../_StringProcessing/Engine/MEBuiltins.swift | 74 ++++++++++++------- .../_CharacterClassModel.swift | 62 +++++++++------- 3 files changed, 86 insertions(+), 52 deletions(-) diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 7c81b1326..d6372c0ba 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -362,7 +362,7 @@ struct CharacterClassPayload: RawRepresentable { } 
init(_ model: _CharacterClassModel) { let invertedBit = model.isInverted ? CharacterClassPayload.invertedBit : 0 - let strictASCIIBit = model.isStrictAscii ? CharacterClassPayload.strictASCIIBit : 0 + let strictASCIIBit = model.isStrictASCII ? CharacterClassPayload.strictASCIIBit : 0 let scalarBit = model.matchLevel == .unicodeScalar ? CharacterClassPayload.scalarBit : 0 assert(model.cc.rawValue <= CharacterClassPayload.ccMask) assert(model.cc.rawValue & invertedBit & strictASCIIBit & scalarBit == 0) // Sanity check diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 18b397191..e55834148 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -1,5 +1,14 @@ @_implementationOnly import _RegexParser // For AssertionKind +extension Character { + var _isHorizontalWhitespace: Bool { + self.unicodeScalars.first?.isHorizontalWhitespace == true + } + var _isNewline: Bool { + self.unicodeScalars.first?.isNewline == true + } +} + extension Processor { mutating func matchBuiltin( _ cc: _CharacterClassModel.Representation, @@ -12,45 +21,56 @@ extension Processor { return false } - var asciiCheck: Bool { - (char.isASCII && !isScalarSemantics) + let asciiCheck = (char.isASCII && !isScalarSemantics) || (scalar.isASCII && isScalarSemantics) || !isStrictASCII - } + var matched: Bool var next: Input.Index - if isScalarSemantics { + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = input.index(after: currentPosition) + case (_, .anyScalar): + // FIXME: This allows us to be not-scalar aligned when in grapheme mode + // Should this even be allowed? 
next = input.unicodeScalars.index(after: currentPosition) - } else { + case (true, _): + next = input.unicodeScalars.index(after: currentPosition) + case (false, _): next = input.index(after: currentPosition) } + switch cc { - case .any: - matched = true - case .anyGrapheme: - matched = true - next = input.index(after: currentPosition) - case .anyScalar: - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? + case .any, .anyGrapheme, .anyScalar: matched = true - next = input.unicodeScalars.index(after: currentPosition) case .digit: if isScalarSemantics { - matched = scalar.properties.numericType != nil + matched = scalar.properties.numericType != nil && asciiCheck } else { matched = char.isNumber && asciiCheck } case .horizontalWhitespace: - matched = scalar.isHorizontalWhitespace && asciiCheck + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } case .verticalWhitespace: - matched = scalar.isNewline && asciiCheck + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } case .newlineSequence: - matched = scalar.isNewline && asciiCheck - if isScalarSemantics && matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar sematnics - input.unicodeScalars.formIndex(after: &next) + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != input.endIndex && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + input.unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck } case .whitespace: if isScalarSemantics { @@ -65,16 +85,18 @@ extension Processor { matched = char.isWordCharacter && asciiCheck } } + if isInverted { matched.toggle() } - 
if matched { - currentPosition = next - return true - } else { + + guard matched else { signalFailure() return false } + + currentPosition = next + return true } func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index cdedf1530..3be26f27f 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -23,7 +23,7 @@ struct _CharacterClassModel: Hashable { let matchLevel: MatchingOptions.SemanticLevel /// If this character character class only matches ascii characters - let isStrictAscii: Bool + let isStrictASCII: Bool /// Whether this character class matches against an inverse, /// e.g \D, \S, [^abc]. @@ -36,9 +36,10 @@ struct _CharacterClassModel: Hashable { ) { self.cc = cc self.matchLevel = options.semanticLevel - self.isStrictAscii = cc.isStrictAscii(options: options) + self.isStrictASCII = cc.isStrictAscii(options: options) self.isInverted = isInverted } + enum Representation: UInt64, Hashable { /// Any character case any = 0 @@ -80,45 +81,56 @@ struct _CharacterClassModel: Hashable { let char = input[currentPosition] let scalar = input.unicodeScalars[currentPosition] let isScalarSemantics = matchLevel == .unicodeScalar - var asciiCheck: Bool { - (char.isASCII && !isScalarSemantics) + let asciiCheck = (char.isASCII && !isScalarSemantics) || (scalar.isASCII && isScalarSemantics) - || !isStrictAscii - } + || !isStrictASCII + var matched: Bool var next: String.Index - if isScalarSemantics { + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = input.index(after: currentPosition) + case (_, .anyScalar): + // FIXME: This allows us to be not-scalar aligned when in grapheme mode + // Should this even be allowed? 
next = input.unicodeScalars.index(after: currentPosition) - } else { + case (true, _): + next = input.unicodeScalars.index(after: currentPosition) + case (false, _): next = input.index(after: currentPosition) } + switch cc { - case .any: - matched = true - case .anyGrapheme: + case .any, .anyGrapheme, .anyScalar: matched = true - next = input.index(after: currentPosition) - case .anyScalar: - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? - matched = true - next = input.unicodeScalars.index(after: currentPosition) case .digit: if isScalarSemantics { - matched = scalar.properties.numericType != nil + matched = scalar.properties.numericType != nil && asciiCheck } else { matched = char.isNumber && asciiCheck } case .horizontalWhitespace: - matched = scalar.isHorizontalWhitespace && asciiCheck + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } case .verticalWhitespace: - matched = scalar.isNewline && asciiCheck + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } case .newlineSequence: - matched = scalar.isNewline && asciiCheck - if isScalarSemantics && matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar sematnics - input.unicodeScalars.formIndex(after: &next) + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != input.endIndex && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar sematnics + input.unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck } case .whitespace: if isScalarSemantics { From c7185435e3015fcf2b4ec179449d20d8d2e560c8 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 25 Jul 2022 14:34:17 -0700 
Subject: [PATCH 20/22] Perform boundary check for .anyScalar when in grapheme mode --- Sources/_StringProcessing/Engine/MEBuiltins.swift | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index e55834148..d05348893 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -31,8 +31,6 @@ extension Processor { case (_, .anyGrapheme): next = input.index(after: currentPosition) case (_, .anyScalar): - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? next = input.unicodeScalars.index(after: currentPosition) case (true, _): next = input.unicodeScalars.index(after: currentPosition) @@ -41,8 +39,14 @@ extension Processor { } switch cc { - case .any, .anyGrapheme, .anyScalar: + case .any, .anyGrapheme: matched = true + case .anyScalar: + if isScalarSemantics { + matched = true + } else { + matched = input.isOnGraphemeClusterBoundary(next) + } case .digit: if isScalarSemantics { matched = scalar.properties.numericType != nil && asciiCheck From 3f0ece56155071cd1d247d86b385d58cfdaded85 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 25 Jul 2022 16:54:52 -0700 Subject: [PATCH 21/22] Emit custom character classes via saves and branches - Removes the main consumer interface for ccc - Removes a lot of the consumer interface code required for ccc - Adds an optimization for collecting the ascii parts of a ccc - Use normal matching code in a CCC --- Sources/_StringProcessing/ByteCodeGen.swift | 252 ++++++++++++++- .../_StringProcessing/ConsumerInterface.swift | 290 ++---------------- .../Engine/Instruction.swift | 8 +- .../_StringProcessing/Engine/MEBuilder.swift | 8 + .../_StringProcessing/Engine/Processor.swift | 4 + Sources/_StringProcessing/Regex/DSLTree.swift | 28 ++ .../_CharacterClassModel.swift | 93 ------ 
Tests/RegexTests/CompileTests.swift | 3 + 8 files changed, 323 insertions(+), 363 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 0003cfa5d..0408006c8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -654,6 +654,141 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + + mutating func emitCharacterInCCC(_ c: Character) { + let isCaseInsensitive = options.isCaseInsensitive + switch options.semanticLevel { + case .graphemeCluster: + emitCharacter(c) + case .unicodeScalar: + let consumers = c.unicodeScalars.map { s in consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + }} + let consumer: MEProgram.ConsumeFunction = { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx + } + } + return nil + } + builder.buildConsume(by: consumer) + } + } + + mutating func emitCCCMember( + _ member: DSLTree.CustomCharacterClass.Member + ) throws { + switch member { + case .atom(let atom): + switch atom { + case .char(let c): + emitCharacterInCCC(c) + case .scalar(let s): + emitCharacterInCCC(Character(s)) + default: + try emitAtom(atom) + } + case .custom(let ccc): + try emitCustomCharacterClass(ccc) + case .range, .quotedLiteral: + let consumer = try member.generateConsumer(options) + builder.buildConsume(by: consumer) + case .trivia: + return + // store current position r0 + // lhs + // store current position r1 + // restore to r0 position + // rhs + // cond branch if same position as r1 to end + // .invalid + // end: ... 
+ case let .intersection(lhs, rhs): + let r0 = builder.makePositionRegister() + let r1 = builder.makePositionRegister() + let end = builder.makeAddress() + + builder.buildMoveCurrentPosition(into: r0) + try emitCustomCharacterClass(lhs) + builder.buildMoveCurrentPosition(into: r1) + + builder.buildRestorePosition(from: r0) + try emitCustomCharacterClass(rhs) + + builder.buildCondBranch(to: end, ifSamePositionAs: r1) + builder.buildFatalError() + builder.label(end) + + // store current position + // lhs + // save to end + // restore current position + // rhs + // clear, fail (since both succeeded) + // end: ... + case let .subtraction(lhs, rhs): + let r = builder.makePositionRegister() + let end = builder.makeAddress() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) + builder.buildSave(end) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) + builder.buildClear() + builder.buildFail() + builder.label(end) + + // lily fixme: this duplicates the code emission from rhs + // do we care? we could track the success/fail in registers + // and then emit a bunch of conditional branches to fail/success? + + // store current position + // save to lhsFail + // lhs + // save to rhsFail + // restore current position + // rhs + // both succeeded, clear both and fail + // rhsFail: clear, goto end + // lhsFail: + // restore current position + // rhs + // end: ... 
+ case let .symmetricDifference(lhs, rhs): + let r = builder.makePositionRegister() + let lhsFail = builder.makeAddress() + let rhsFail = builder.makeAddress() + let end = builder.makeAddress() + + builder.buildMoveCurrentPosition(into: r) + builder.buildSave(lhsFail) // saves lhsFail + try emitCustomCharacterClass(lhs) + builder.buildSave(rhsFail) // saves rhsFail + + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) + // Both succeeded, fail + builder.buildClear() // clears save(to: rhsFail) + builder.buildClear() // clears save(to: lhsFail) + builder.buildFail() + + // rhsFail + builder.label(rhsFail) + builder.buildClear() // clears save(to: lhsFail) + builder.buildBranch(to: end) + + // lhsFail + builder.label(lhsFail) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) + + // end + builder.label(end) + } + } mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass @@ -667,8 +802,95 @@ fileprivate extension Compiler.ByteCodeGen { } return } - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + let updatedCCC: DSLTree.CustomCharacterClass + if optimizationsEnabled { + updatedCCC = ccc.coalesedASCIIMembers(options) + } else { + updatedCCC = ccc + } + let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia}) + + if updatedCCC.isInverted { + // inverted + // custom character class: p0 | p1 | ... | pn + // Try each member to make sure they all fail + // save next_p1 + // + // clear, fail + // next_p1: + // save next_p2 + // + // clear fail + // next_p2: + // save next_p... + // + // clear fail + // ... 
+ // next_pn: + // save done + // + // clear fail + // done: + // step forward by 1 + let done = builder.makeAddress() + for member in filteredMembers.dropLast() { + let next = builder.makeAddress() + builder.buildSave(next) + try emitCCCMember(member) + builder.buildClear() + builder.buildFail() + builder.label(next) + } + builder.buildSave(done) + try emitCCCMember(filteredMembers.last!) + builder.buildClear() + builder.buildFail() + builder.label(done) + + // Consume a single unit for the inverted ccc + switch options.semanticLevel { + case .graphemeCluster: + builder.buildAdvance(1) + case .unicodeScalar: + // TODO: builder.buildAdvanceUnicodeScalar(1) + builder.buildConsume { input, bounds in + input.unicodeScalars.index(after: bounds.lowerBound) + } + } + return + } + // non inverted CCC + // Custom character class: p0 | p1 | ... | pn + // Very similar to alternation, but we don't keep backtracking save points + // save next_p1 + // + // clear + // branch done + // next_p1: + // save next_p2 + // + // clear + // branch done + // next_p2: + // save next_p... + // + // clear + // branch done + // ... + // next_pn: + // + // done: + let done = builder.makeAddress() + for member in filteredMembers.dropLast() { + let next = builder.makeAddress() + builder.buildSave(next) + try emitCCCMember(member) + builder.buildClear() + builder.buildBranch(to: done) + builder.label(next) + } + try emitCCCMember(filteredMembers.last!) + builder.label(done) } @discardableResult @@ -793,8 +1015,8 @@ extension DSLTree.Node { case .consumer, .matcher: // Allow zero width consumers and matchers return false - case .customCharacterClass: - return true + case .customCharacterClass(let ccc): + return ccc.guaranteesForwardProgress case .quantification(let amount, _, let child): let (atLeast, _) = amount.ast.bounds return atLeast ?? 
0 > 0 && child.guaranteesForwardProgress @@ -802,3 +1024,25 @@ extension DSLTree.Node { } } } + +extension DSLTree.CustomCharacterClass { + /// We allow trivia into CustomCharacterClass, which could result in a CCC that matches nothing + /// ie (?x)[ ] + var guaranteesForwardProgress: Bool { + for m in members { + switch m { + case .trivia: + continue + case let .intersection(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + case let .subtraction(lhs, _): + return lhs.guaranteesForwardProgress + case let .symmetricDifference(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + default: + return true + } + } + return false + } +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 370c74e91..3c3a9a2f3 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -18,87 +18,12 @@ extension Character { } } -extension DSLTree.Node { - /// Attempt to generate a consumer from this AST node - /// - /// A consumer is a Swift closure that matches against - /// the front of an input range - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case .atom(let a): - return try a.generateConsumer(opts) - case .customCharacterClass(let ccc): - return try ccc.generateConsumer(opts) - - case .quotedLiteral: - // TODO: Should we handle this here? 
- return nil - - case let .convertedRegexLiteral(n, _): - return try n.generateConsumer(opts) - - case .orderedChoice, .conditional, .concatenation, - .capture, .nonCapturingGroup, - .quantification, .trivia, .empty, - .absentFunction: return nil - - case .consumer: - fatalError("FIXME: Is this where we handle them?") - case .matcher: - fatalError("FIXME: Is this where we handle them?") - case .characterPredicate: - fatalError("FIXME: Is this where we handle them?") - } - } -} - extension DSLTree._AST.Atom { var singleScalarASCIIValue: UInt8? { return ast.singleScalarASCIIValue } } -extension Character { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - let isCaseInsensitive = opts.isCaseInsensitive - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && isCased { - return input[low].lowercased() == lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == self - ? input.index(after: low) - : nil - } - } - case .unicodeScalar: - // TODO: This should only be reachable from character class emission, can - // we guarantee that? Otherwise we'd want a different matching behavior. - let consumers = unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } - } -} - extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { @@ -113,82 +38,14 @@ extension DSLTree.Atom { } } - // TODO: If ByteCodeGen switches first, then this is unnecessary for - // top-level nodes, but it's also invoked for `.atom` members of a custom CC func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? 
{ switch self { - case let .char(c): - return try c.generateConsumer(opts) - - case let .scalar(s): - // A scalar always matches the same as a single scalar character. This - // means it must match a whole grapheme in grapheme semantic mode, but - // can match a single scalar in scalar semantic mode. - return try Character(s).generateConsumer(opts) - - case .any: - // FIXME: Should this be a total ordering? - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - input.index(after: bounds.lowerBound) - } - } else { - return consumeScalar { _ in - true - } - } - - case .anyNonNewline: - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) - } - case .unicodeScalar: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.unicodeScalars.index(after: bounds.lowerBound) - } - } - - case .dot: - throw Unreachable(".atom(.dot) should be handled by emitDot") - - case .assertion: - // TODO: We could handle, should this be total? - return nil - case .characterClass(let cc): - return cc.generateConsumer(opts) - - case .backreference: - // TODO: Should we handle? - return nil - - case .symbolicReference: - // TODO: Should we handle? - return nil - - case .changeMatchingOptions: - // TODO: Should we handle? - return nil - case let .unconverted(a): return try a.ast.generateConsumer(opts) - } - - } -} - -extension DSLTree.Atom.CharacterClass { - func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { - let model = asRuntimeModel(opts) - return { input, bounds in - model.matches(in: input, at: bounds.lowerBound) + default: + throw Unreachable("Should have been handled in bytecode gen") } } } @@ -281,47 +138,19 @@ extension AST.Atom { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? 
{ switch kind { - case let .scalar(s): - assertionFailure( - "Should have been handled by tree conversion") - return consumeScalar { $0 == s.value } - - case let .char(c): - assertionFailure( - "Should have been handled by tree conversion") - - // TODO: Match level? - return { input, bounds in - let low = bounds.lowerBound - guard input[low] == c else { - return nil - } - return input.index(after: low) - } - case let .property(p): return try p.generateConsumer(opts) - + case let .namedCharacter(name): return consumeName(name, opts: opts) - - case .dot: - assertionFailure( - "Should have been handled by tree conversion") - fatalError(".atom(.dot) is handled in emitDot") - - case .caretAnchor, .dollarAnchor: - // handled in emitAssertion - return nil - case .escaped: - // handled in emitAssertion and emitCharacterClass - return nil - + case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement return nil + default: + fatalError("All other cases should have been handled by ByteCodeGen or converted to a DSLNode") } } } @@ -354,18 +183,28 @@ extension DSLTree.CustomCharacterClass.Member { } return nil } - + func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { switch self { - case let .atom(a): - guard let c = try a.generateConsumer(opts) else { - throw Unsupported("Consumer for \(a)") + case .quotedLiteral(let s): + if opts.isCaseInsensitive { + return { input, bounds in + guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { + return nil + } + return input.index(after: bounds.lowerBound) + } + } else { + return { input, bounds in + guard s.contains(input[bounds.lowerBound]) else { + return nil + } + return input.index(after: bounds.lowerBound) + } } - return c - case let .range(low, high): - // TODO: + case .range(let low, let high): guard let lhs = low.literalCharacterValue else 
{ throw Unsupported("\(low) in range") } @@ -398,65 +237,8 @@ extension DSLTree.CustomCharacterClass.Member { return nil } } - - case let .custom(ccc): - return try ccc.generateConsumer(opts) - - case let .intersection(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - let rhsIdx = rhs(input, bounds) - { - guard lhsIdx == rhsIdx else { - fatalError("TODO: What should we do here?") - } - return lhsIdx - } - return nil - } - - case let .subtraction(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - rhs(input, bounds) == nil - { - return lhsIdx - } - return nil - } - - case let .symmetricDifference(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds) { - return rhs(input, bounds) == nil ? lhsIdx : nil - } - return rhs(input, bounds) - } - case .quotedLiteral(let s): - if opts.isCaseInsensitive { - return { input, bounds in - guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } else { - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } - case .trivia: - // TODO: Should probably strip this earlier... 
- return { _, _ in nil } + default: + fatalError("Unreachable: should have been handled by bytecodegen") } } } @@ -474,28 +256,6 @@ extension DSLTree.CustomCharacterClass { } ) } - - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - // NOTE: Easy way to implement, obviously not performant - let consumers = try members.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return isInverted ? nil : idx - } - } - if isInverted { - return opts.semanticLevel == .graphemeCluster - ? input.index(after: bounds.lowerBound) - : input.unicodeScalars.index(after: bounds.lowerBound) - } - return nil - } - } } // NOTE: Conveniences, though not most performant diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index f2ee88636..efaa6967f 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -45,6 +45,12 @@ extension Instruction { /// - Position register to move into case moveCurrentPosition + /// Set the current position to the value stored in the register + /// + /// restorePosition(from: PositionRegister) + /// Operands: + /// - Position register to read from + case restorePosition // MARK: General Purpose: Control flow /// Branch to a new instruction @@ -254,7 +260,7 @@ extension Instruction { ).unsafelyUnwrapped } set { - assert(newValue != .invalid, "consider hoisting this") + // assert(newValue != .invalid, "consider hoisting this") assert(newValue.rawValue < 256) self.rawValue &= ~_opcodeMask self.rawValue |= newValue.rawValue &<< 56 diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 3406e9fed..3fea72316 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -77,6 +77,10 @@ 
extension MEProgram.Builder { .init(instructions.endIndex - 1) } + mutating func buildFatalError() { + instructions.append(.init(.invalid)) + } + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { @@ -246,6 +250,10 @@ extension MEProgram.Builder { mutating func buildMoveCurrentPosition(into r: PositionRegister) { instructions.append(.init(.moveCurrentPosition, .init(position: r))) } + + mutating func buildRestorePosition(from r: PositionRegister) { + instructions.append(.init(.restorePosition, .init(position: r))) + } mutating func buildBackreference( _ cap: CaptureRegister diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 55ac49ed9..23c58bf31 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -382,6 +382,10 @@ extension Processor { let reg = payload.position registers[reg] = currentPosition controller.step() + case .restorePosition: + let reg = payload.position + currentPosition = registers[reg] + controller.step() case .branch: controller.pc = payload.addr diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b073511c1..bb17f467b 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -128,6 +128,23 @@ extension DSLTree { } } + func coalesedASCIIMembers(_ opts: MatchingOptions) -> CustomCharacterClass { + var ascii: [Member] = [] + var nonAscii: [Member] = [] + for member in members { + if member.asAsciiBitset(opts, false) != nil { + ascii.append(member) + } else { + nonAscii.append(member) + } + } + if ascii.isEmpty || nonAscii.isEmpty { return self } + return CustomCharacterClass(members: [ + .custom(CustomCharacterClass(members: ascii)), + .custom(CustomCharacterClass(members: nonAscii)) + ], isInverted: isInverted) + } + public init(members: [DSLTree.CustomCharacterClass.Member], 
isInverted: Bool = false) { self.members = members self.isInverted = isInverted @@ -158,6 +175,17 @@ extension DSLTree { indirect case intersection(CustomCharacterClass, CustomCharacterClass) indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) + + var isOnlyTrivia: Bool { + switch self { + case .custom(let ccc): + return ccc.members.all(\.isOnlyTrivia) + case .trivia: + return true + default: + return false + } + } } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 3be26f27f..35c4b4752 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -61,99 +61,6 @@ struct _CharacterClassModel: Hashable { /// Character.isLetter or Character.isDigit or Character == "_" case word } - - /// Returns the end of the match of this character class in the string. - /// - /// - Parameter str: The string to match against. - /// - Parameter at: The index to start matching. - /// - Parameter options: Options for the match operation. - /// - Returns: The index of the end of the match, or `nil` if there is no match. - func matches( - in input: String, - at currentPosition: String.Index - ) -> String.Index? { - // FIXME: This is only called in custom character classes that contain builtin - // character classes as members (ie: [a\w] or set operations), is there - // any way to avoid that? Can we remove this somehow? 
- guard currentPosition != input.endIndex else { - return nil - } - let char = input[currentPosition] - let scalar = input.unicodeScalars[currentPosition] - let isScalarSemantics = matchLevel == .unicodeScalar - let asciiCheck = (char.isASCII && !isScalarSemantics) - || (scalar.isASCII && isScalarSemantics) - || !isStrictASCII - - var matched: Bool - var next: String.Index - switch (isScalarSemantics, cc) { - case (_, .anyGrapheme): - next = input.index(after: currentPosition) - case (_, .anyScalar): - // FIXME: This allows us to be not-scalar aligned when in grapheme mode - // Should this even be allowed? - next = input.unicodeScalars.index(after: currentPosition) - case (true, _): - next = input.unicodeScalars.index(after: currentPosition) - case (false, _): - next = input.index(after: currentPosition) - } - - switch cc { - case .any, .anyGrapheme, .anyScalar: - matched = true - case .digit: - if isScalarSemantics { - matched = scalar.properties.numericType != nil && asciiCheck - } else { - matched = char.isNumber && asciiCheck - } - case .horizontalWhitespace: - if isScalarSemantics { - matched = scalar.isHorizontalWhitespace && asciiCheck - } else { - matched = char._isHorizontalWhitespace && asciiCheck - } - case .verticalWhitespace: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - } else { - matched = char._isNewline && asciiCheck - } - case .newlineSequence: - if isScalarSemantics { - matched = scalar.isNewline && asciiCheck - if matched && scalar == "\r" - && next != input.endIndex && input.unicodeScalars[next] == "\n" { - // Match a full CR-LF sequence even in scalar sematnics - input.unicodeScalars.formIndex(after: &next) - } - } else { - matched = char._isNewline && asciiCheck - } - case .whitespace: - if isScalarSemantics { - matched = scalar.properties.isWhitespace && asciiCheck - } else { - matched = char.isWhitespace && asciiCheck - } - case .word: - if isScalarSemantics { - matched = scalar.properties.isAlphabetic && asciiCheck 
- } else { - matched = char.isWordCharacter && asciiCheck - } - } - if isInverted { - matched.toggle() - } - if matched { - return next - } else { - return nil - } - } } extension _CharacterClassModel.Representation { diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index a7c0ee531..f0b1bfb79 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -18,6 +18,7 @@ enum DecodedInstr { case invalid case moveImmediate case moveCurrentPosition + case restorePosition case branch case condBranchZeroElseDecrement case condBranchSamePosition @@ -62,6 +63,8 @@ extension DecodedInstr { return .moveImmediate case .moveCurrentPosition: return .moveCurrentPosition + case .restorePosition: + return .restorePosition case .branch: return .branch case .condBranchZeroElseDecrement: From 79aabab4aead87371acec56a36d031e0694e9783 Mon Sep 17 00:00:00 2001 From: Lily Lin Date: Mon, 25 Jul 2022 17:04:45 -0700 Subject: [PATCH 22/22] Add some comments --- Sources/_StringProcessing/ByteCodeGen.swift | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 0408006c8..9c93045b7 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -655,7 +655,9 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } - mutating func emitCharacterInCCC(_ c: Character) { + /// A character in a custom character class should match all of its component scalars + /// when in scalar semantic mode + mutating func emitCharacterInCustomCharacterClass(_ c: Character) { let isCaseInsensitive = options.isCaseInsensitive switch options.semanticLevel { case .graphemeCluster: @@ -678,16 +680,16 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitCCCMember( + mutating func emitCustomCharacterClassMember( _ member: 
DSLTree.CustomCharacterClass.Member ) throws { switch member { case .atom(let atom): switch atom { case .char(let c): - emitCharacterInCCC(c) + emitCharacterInCustomCharacterClass(c) case .scalar(let s): - emitCharacterInCCC(Character(s)) + emitCharacterInCustomCharacterClass(Character(s)) default: try emitAtom(atom) } @@ -836,13 +838,13 @@ fileprivate extension Compiler.ByteCodeGen { for member in filteredMembers.dropLast() { let next = builder.makeAddress() builder.buildSave(next) - try emitCCCMember(member) + try emitCustomCharacterClassMember(member) builder.buildClear() builder.buildFail() builder.label(next) } builder.buildSave(done) - try emitCCCMember(filteredMembers.last!) + try emitCustomCharacterClassMember(filteredMembers.last!) builder.buildClear() builder.buildFail() builder.label(done) @@ -884,12 +886,12 @@ fileprivate extension Compiler.ByteCodeGen { for member in filteredMembers.dropLast() { let next = builder.makeAddress() builder.buildSave(next) - try emitCCCMember(member) + try emitCustomCharacterClassMember(member) builder.buildClear() builder.buildBranch(to: done) builder.label(next) } - try emitCCCMember(filteredMembers.last!) + try emitCustomCharacterClassMember(filteredMembers.last!) builder.label(done) }