diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index ea52c28f3..08c7d347e 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -15,27 +15,39 @@ @available(SwiftStdlib 5.7, *) public struct CharacterClass { internal var ccc: DSLTree.CustomCharacterClass + /// The builtin character class, if this CharacterClass is representable by one + internal var builtin: DSLTree.Atom.CharacterClass? init(_ ccc: DSLTree.CustomCharacterClass) { self.ccc = ccc + self.builtin = nil } - init(unconverted atom: DSLTree._AST.Atom) { - self.ccc = .init(members: [.atom(.unconverted(atom))]) + init(builtin: DSLTree.Atom.CharacterClass) { + self.ccc = .init(members: [.atom(.characterClass(builtin))]) + self.builtin = builtin } } @available(SwiftStdlib 5.7, *) extension CharacterClass: RegexComponent { public var regex: Regex { - _RegexFactory().customCharacterClass(ccc) + if let cc = builtin { + return _RegexFactory().characterClass(cc) + } else { + return _RegexFactory().customCharacterClass(ccc) + } } } @available(SwiftStdlib 5.7, *) extension CharacterClass { public var inverted: CharacterClass { - CharacterClass(ccc.inverted) + if let inv = builtin?.inverted { + return CharacterClass(builtin: inv) + } else { + return CharacterClass(ccc.inverted) + } } } @@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: ._anyGrapheme) + .init(builtin: .anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: ._whitespace) + .init(builtin: .whitespace) } public static var digit: CharacterClass { - .init(unconverted: ._digit) + .init(builtin: .digit) } public static var hexDigit: CharacterClass { @@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: ._horizontalWhitespace) + 
.init(builtin: .horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: ._newlineSequence) + .init(builtin: .newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: ._verticalWhitespace) + .init(builtin: .verticalWhitespace) } public static var word: CharacterClass { - .init(unconverted: ._word) + .init(builtin: .word) } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index e8c92f2b5..da0888039 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen { emitMatchScalar(s) } + case let .characterClass(cc): + emitCharacterClass(cc) + case let .assertion(kind): try emitAssertion(kind) @@ -148,147 +151,24 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitStartOfLine() { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } - } - - mutating func emitEndOfLine() { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } - } - mutating func emitAssertion( _ kind: DSLTree.Atom.Assertion ) throws { - // FIXME: Depends on API model we have... 
We may want to - // think through some of these with API interactions in mind - // - // This might break how we use `bounds` for both slicing - // and things like `firstIndex`, that is `firstIndex` may - // need to supply both a slice bounds and a per-search bounds. - switch kind { - case .startOfSubject: - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.lowerBound - } - - case .endOfSubjectBeforeNewline: - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input.index(after: pos) == subjectBounds.upperBound - && input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound - && input.unicodeScalars[pos].isNewline - } - } - - case .endOfSubject: - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.upperBound - } - - case .resetStartOfMatch: - // FIXME: Figure out how to communicate this out + if kind == .resetStartOfMatch { throw Unsupported(#"\K (reset/keep assertion)"#) - - case .firstMatchingPositionInSubject: - // TODO: We can probably build a nice model with API here - - // FIXME: This needs to be based on `searchBounds`, - // not the `subjectBounds` given as an argument here - builder.buildAssert { (_, _, input, pos, subjectBounds) in false } - - case .textSegment: - builder.buildAssert { (_, _, input, pos, _) in - // FIXME: Grapheme or word based on options - input.isOnGraphemeClusterBoundary(pos) - } - - case .notTextSegment: - builder.buildAssert { (_, _, input, pos, _) in - // FIXME: Grapheme or word based on options - !input.isOnGraphemeClusterBoundary(pos) - } - - case .startOfLine: - emitStartOfLine() - - case .endOfLine: - emitEndOfLine() - - case .caretAnchor: - if options.anchorsMatchNewlines { - emitStartOfLine() - } else { - builder.buildAssert { (_, _, input, pos, 
subjectBounds) in - pos == subjectBounds.lowerBound - } - } - - case .dollarAnchor: - if options.anchorsMatchNewlines { - emitEndOfLine() - } else { - builder.buildAssert { (_, _, input, pos, subjectBounds) in - pos == subjectBounds.upperBound - } - } - - case .wordBoundary: - builder.buildAssert { [options] - (cache, maxIndex, input, pos, subjectBounds) in - if options.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return _CharacterClassModel.word.isBoundary( - input, - at: pos, - bounds: subjectBounds, - with: options - ) - } else { - return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) - } - } - - case .notWordBoundary: - builder.buildAssert { [options] - (cache, maxIndex, input, pos, subjectBounds) in - if options.usesSimpleUnicodeBoundaries { - // TODO: How should we handle bounds? - return !_CharacterClassModel.word.isBoundary( - input, - at: pos, - bounds: subjectBounds, - with: options - ) - } else { - return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) - } - } } + builder.buildAssert( + by: kind, + options.anchorsMatchNewlines, + options.usesSimpleUnicodeBoundaries, + options.usesASCIIWord, + options.semanticLevel) } - + + mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { + builder.buildMatchBuiltin(model: cc.asRuntimeModel(options)) + } + mutating func emitMatchScalar(_ s: UnicodeScalar) { assert(options.semanticLevel == .unicodeScalar) if options.isCaseInsensitive && s.properties.isCased { @@ -907,10 +787,10 @@ fileprivate extension Compiler.ByteCodeGen { } else { builder.buildMatchAsciiBitset(asciiBitset) } - } else { - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + return } + let consumer = try ccc.generateConsumer(options) + builder.buildConsume(by: consumer) } mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift 
index 083781120..3a2731b0a 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -162,6 +162,8 @@ extension DSLTree.Atom { case .assertion: // TODO: We could handle, should this be total? return nil + case .characterClass(let cc): + return cc.generateConsumer(opts) case .backreference: // TODO: Should we handle? @@ -182,6 +184,15 @@ extension DSLTree.Atom { } } +extension DSLTree.Atom.CharacterClass { + func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { + let model = asRuntimeModel(opts) + return { input, bounds in + model.matches(in: input, at: bounds.lowerBound) + } + } +} + extension String { /// Compares this string to `other` using the loose matching rule UAX44-LM2, /// which ignores case, whitespace, underscores, and nearly all medial @@ -269,16 +280,6 @@ extension AST.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - // TODO: Wean ourselves off of this type... - if let cc = self.characterClass?.withMatchLevel( - opts.matchLevel - ) { - return { input, bounds in - // FIXME: should we worry about out of bounds? 
- cc.matches(in: input, at: bounds.lowerBound, with: opts) - } - } - switch kind { case let .scalar(s): assertionFailure( @@ -312,8 +313,11 @@ extension AST.Atom { case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil + case .escaped: + // handled in emitAssertion and emitCharacterClass + return nil - case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, + case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 42fb86913..d6372c0ba 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -9,7 +9,6 @@ // //===----------------------------------------------------------------------===// - extension Instruction { /// An instruction's payload packs operands and destination /// registers. 
@@ -51,7 +50,6 @@ extension Instruction.Payload { case element(ElementRegister) case consumer(ConsumeFunctionRegister) case bitset(AsciiBitsetRegister) - case assertion(AssertionFunctionRegister) case addr(InstructionAddress) case capture(CaptureRegister) @@ -225,7 +223,7 @@ extension Instruction.Payload { let pair: (UInt64, AsciiBitsetRegister) = interpretPair() return (isScalar: pair.0 == 1, pair.1) } - + init(consumer: ConsumeFunctionRegister) { self.init(consumer) } @@ -233,13 +231,6 @@ extension Instruction.Payload { interpret() } - init(assertion: AssertionFunctionRegister) { - self.init(assertion) - } - var assertion: AssertionFunctionRegister { - interpret() - } - init(addr: InstructionAddress) { self.init(addr) } @@ -339,5 +330,110 @@ extension Instruction.Payload { ) { interpretPair() } + // MARK: Struct payloads + init(_ model: _CharacterClassModel) { + self.init(CharacterClassPayload(model).rawValue) + } + var characterClassPayload: CharacterClassPayload{ + return CharacterClassPayload(rawValue: rawValue & _payloadMask) + } + + init(assertion payload: AssertionPayload) { + self.init(rawValue: payload.rawValue) + } + var assertion: AssertionPayload { + AssertionPayload.init(rawValue: self.rawValue & _payloadMask) + } } +// MARK: Struct definitions +struct CharacterClassPayload: RawRepresentable { + let rawValue: UInt64 + // Layout: + // Top three bits are isInverted, isStrict, isScalar + // Lower 8 bits are _CCM.Representation + static var invertedBit: UInt64 { 1 << 55 } + static var strictASCIIBit: UInt64 { 1 << 54 } + static var scalarBit: UInt64 { 1 << 53 } + static var ccMask: UInt64 { 0xFF } + init(rawValue: UInt64) { + assert(rawValue & _opcodeMask == 0) + self.rawValue = rawValue + } + init(_ model: _CharacterClassModel) { + let invertedBit = model.isInverted ? CharacterClassPayload.invertedBit : 0 + let strictASCIIBit = model.isStrictASCII ? CharacterClassPayload.strictASCIIBit : 0 + let scalarBit = model.matchLevel == .unicodeScalar ? 
CharacterClassPayload.scalarBit : 0 + assert(model.cc.rawValue <= CharacterClassPayload.ccMask) + assert(model.cc.rawValue & (invertedBit | strictASCIIBit | scalarBit) == 0) // Sanity check: flag bits must not overlap the class bits + self.init(rawValue: model.cc.rawValue | invertedBit | strictASCIIBit | scalarBit) + } + + var isInverted: Bool { + self.rawValue & CharacterClassPayload.invertedBit != 0 + } + /// Represents if the given character class should strictly only match ascii values based on the options given + /// See Oniguruma options: (?D) (?\P) (?S) (?W) + var isStrictASCII: Bool { + self.rawValue & CharacterClassPayload.strictASCIIBit != 0 + } + var isScalarSemantics: Bool { + self.rawValue & CharacterClassPayload.scalarBit != 0 + } + var cc: _CharacterClassModel.Representation { + _CharacterClassModel.Representation.init( + rawValue: self.rawValue & CharacterClassPayload.ccMask).unsafelyUnwrapped + } +} + +struct AssertionPayload: RawRepresentable { + let rawValue: UInt64 + + init(rawValue: UInt64) { + self.rawValue = rawValue + assert(rawValue & _opcodeMask == 0) + } + + static var anchorBit: UInt64 { 1 << 55 } + static var boundaryBit: UInt64 { 1 << 54 } + static var strictASCIIWordBit: UInt64 { 1 << 53 } + static var isScalarBit: UInt64 { 1 << 52 } + static var assertionKindMask: UInt64 { 0xFF } + + init(_ assertion: DSLTree.Atom.Assertion, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) { + // 4 bits of options + let anchorBit: UInt64 = anchorsMatchNewlines ? AssertionPayload.anchorBit : 0 + let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? AssertionPayload.boundaryBit : 0 + let strictASCIIWordBit: UInt64 = usesASCIIWord ? AssertionPayload.strictASCIIWordBit : 0 + let isScalarBit: UInt64 = semanticLevel == .unicodeScalar ? 
AssertionPayload.isScalarBit : 0 + + // 8 bits for the assertion kind + // Future work: Optimize this layout + let kind = assertion.rawValue + assert(kind <= AssertionPayload.assertionKindMask) + assert(kind & (anchorBit | boundaryBit | strictASCIIWordBit | isScalarBit) == 0) + self.init(rawValue: kind | anchorBit | boundaryBit | strictASCIIWordBit | isScalarBit) + } + + var kind: DSLTree.Atom.Assertion { + return .init( + rawValue: self.rawValue & AssertionPayload.assertionKindMask).unsafelyUnwrapped + } + var anchorsMatchNewlines: Bool { self.rawValue & AssertionPayload.anchorBit != 0 } + var usesSimpleUnicodeBoundaries: Bool { + self.rawValue & AssertionPayload.boundaryBit != 0 + } + var usesASCIIWord: Bool { self.rawValue & AssertionPayload.strictASCIIWordBit != 0 } + var semanticLevel: MatchingOptions.SemanticLevel { + if self.rawValue & AssertionPayload.isScalarBit != 0 { + return .unicodeScalar + } else { + return .graphemeCluster + } + } +} diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 8e1a1f294..f2ee88636 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -113,11 +113,15 @@ extension Instruction { /// - Boolean for if we should match by scalar value case matchBitset - /// TODO: builtin assertions and anchors - case builtinAssertion - - /// TODO: builtin character classes - case builtinCharacterClass + /// Match against a built-in character class + /// + /// matchBuiltin(_: CharacterClassPayload) + /// + /// Operand: the payload contains + /// - The character class + /// - If it is inverted + /// - If it strictly matches only ascii values + case matchBuiltin // MARK: Extension points @@ -127,16 +131,12 @@ extension Instruction { /// Operand: Consume function register to call. case consumeBy - /// Custom lookaround assertion operation. - /// Triggers a failure if customFunction returns false. 
+ /// Lookaround assertion operation. Performs a zero width assertion based on + /// the assertion type and options stored in the payload /// - /// assert(_ customFunction: ( - /// input: Input, - /// currentPos: Position, - /// bounds: Range - /// ) -> Bool) + /// assert(_:AssertionPayload) /// - /// Operands: destination bool register, assert hook register + /// Operands: AssertionPayload containing assertion type and options case assertBy /// Custom value-creating consume operation. diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 0b9a91726..3406e9fed 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -20,7 +20,6 @@ extension MEProgram { var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = [] var consumeFunctions: [ConsumeFunction] = [] - var assertionFunctions: [AssertionFunction] = [] var transformFunctions: [TransformFunction] = [] var matcherFunctions: [MatcherFunction] = [] @@ -171,6 +170,11 @@ extension MEProgram.Builder { instructions.append(.init( .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) } + + mutating func buildMatchBuiltin(model: _CharacterClassModel) { + instructions.append(.init( + .matchBuiltin, .init(model))) + } mutating func buildConsume( by p: @escaping MEProgram.ConsumeFunction @@ -180,10 +184,21 @@ extension MEProgram.Builder { } mutating func buildAssert( - by p: @escaping MEProgram.AssertionFunction + by kind: DSLTree.Atom.Assertion, + _ anchorsMatchNewlines: Bool, + _ usesSimpleUnicodeBoundaries: Bool, + _ usesASCIIWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel ) { + let payload = AssertionPayload.init( + kind, + anchorsMatchNewlines, + usesSimpleUnicodeBoundaries, + usesASCIIWord, + semanticLevel) instructions.append(.init( - .assertBy, .init(assertion: makeAssertionFunction(p)))) + .assertBy, + .init(assertion: payload))) } mutating func buildAccept() { 
@@ -306,7 +321,6 @@ extension MEProgram.Builder { regInfo.positions = nextPositionRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count - regInfo.assertionFunctions = assertionFunctions.count regInfo.transformFunctions = transformFunctions.count regInfo.matcherFunctions = matcherFunctions.count regInfo.captures = nextCaptureRegister.rawValue @@ -317,7 +331,6 @@ extension MEProgram.Builder { staticSequences: sequences.stored, staticBitsets: asciiBitsets, staticConsumeFunctions: consumeFunctions, - staticAssertionFunctions: assertionFunctions, staticTransformFunctions: transformFunctions, staticMatcherFunctions: matcherFunctions, registerInfo: regInfo, @@ -466,12 +479,6 @@ extension MEProgram.Builder { defer { consumeFunctions.append(f) } return ConsumeFunctionRegister(consumeFunctions.count) } - mutating func makeAssertionFunction( - _ f: @escaping MEProgram.AssertionFunction - ) -> AssertionFunctionRegister { - defer { assertionFunctions.append(f) } - return AssertionFunctionRegister(assertionFunctions.count) - } mutating func makeTransformFunction( _ f: @escaping MEProgram.TransformFunction ) -> TransformRegister { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index f791da37e..d05348893 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -1,13 +1,190 @@ +@_implementationOnly import _RegexParser // For AssertionKind +extension Character { + var _isHorizontalWhitespace: Bool { + self.unicodeScalars.first?.isHorizontalWhitespace == true + } + var _isNewline: Bool { + self.unicodeScalars.first?.isNewline == true + } +} extension Processor { + mutating func matchBuiltin( + _ cc: _CharacterClassModel.Representation, + _ isInverted: Bool, + _ isStrictASCII: Bool, + _ isScalarSemantics: Bool + ) -> Bool { + guard let char = load(), let scalar = loadScalar() else { + signalFailure() + 
return false + } + + let asciiCheck = (char.isASCII && !isScalarSemantics) + || (scalar.isASCII && isScalarSemantics) + || !isStrictASCII + + var matched: Bool + var next: Input.Index + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = input.index(after: currentPosition) + case (_, .anyScalar): + next = input.unicodeScalars.index(after: currentPosition) + case (true, _): + next = input.unicodeScalars.index(after: currentPosition) + case (false, _): + next = input.index(after: currentPosition) + } + switch cc { + case .any, .anyGrapheme: + matched = true + case .anyScalar: + if isScalarSemantics { + matched = true + } else { + matched = input.isOnGraphemeClusterBoundary(next) + } + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next < end && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics, without reading past the search bound `end` + input.unicodeScalars.formIndex(after: &next) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } + } - mutating func builtinAssertion() { - fatalError("TODO: assertions and anchors") + if isInverted { + matched.toggle() + 
} + + guard matched else { + signalFailure() + return false + } + + currentPosition = next + return true + } + + func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + if currentPosition == subjectBounds.lowerBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[input.index(before: currentPosition)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline + } } + + func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars[currentPosition].isNewline + } + } + + mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { + // Future work: Optimize layout and dispatch + switch payload.kind { + case .startOfSubject: return currentPosition == subjectBounds.lowerBound + + case .endOfSubjectBeforeNewline: + if currentPosition == subjectBounds.upperBound { return true } + switch payload.semanticLevel { + case .graphemeCluster: + return input.index(after: currentPosition) == subjectBounds.upperBound + && input[currentPosition].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound + && input.unicodeScalars[currentPosition].isNewline + } + + case .endOfSubject: return currentPosition == subjectBounds.upperBound + + case .resetStartOfMatch: + fatalError("Unreachable, we should have thrown an error during compilation") + + case .firstMatchingPositionInSubject: + return currentPosition == searchBounds.lowerBound + + case .textSegment: return input.isOnGraphemeClusterBoundary(currentPosition) + + case .notTextSegment: return !input.isOnGraphemeClusterBoundary(currentPosition) + + case .startOfLine: + return isAtStartOfLine(payload) + case .endOfLine: + return 
isAtEndOfLine(payload) + + case .caretAnchor: + if payload.anchorsMatchNewlines { + return isAtStartOfLine(payload) + } else { + return currentPosition == subjectBounds.lowerBound + } + + case .dollarAnchor: + if payload.anchorsMatchNewlines { + return isAtEndOfLine(payload) + } else { + return currentPosition == subjectBounds.upperBound + } + + case .wordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } - mutating func builtinCharacterClass() { - fatalError("TODO: character classes") + case .notWordBoundary: + if payload.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel) + } else { + return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex) + } + } } } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index d311b4465..bacefb209 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -15,14 +15,6 @@ struct MEProgram { typealias Input = String typealias ConsumeFunction = (Input, Range) -> Input.Index? - typealias AssertionFunction = - ( - inout Set?, - inout String.Index?, - Input, - Input.Index, - Range - ) throws -> Bool typealias TransformFunction = (Input, Processor._StoredCapture) throws -> Any? 
typealias MatcherFunction = @@ -34,7 +26,6 @@ struct MEProgram { var staticSequences: [[Input.Element]] var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] var staticConsumeFunctions: [ConsumeFunction] - var staticAssertionFunctions: [AssertionFunction] var staticTransformFunctions: [TransformFunction] var staticMatcherFunctions: [MatcherFunction] diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 2be918294..55ac49ed9 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// + enum MatchMode { case wholeString case partialFromFront @@ -238,7 +239,7 @@ extension Processor { } return true } - + func loadScalar() -> Unicode.Scalar? { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } @@ -476,6 +477,17 @@ extension Processor { } } + case .matchBuiltin: + let payload = payload.characterClassPayload + if matchBuiltin( + payload.cc, + payload.isInverted, + payload.isStrictASCII, + payload.isScalarSemantics + ) { + controller.step() + } + case .consumeBy: let reg = payload.consumer guard currentPosition < searchBounds.upperBound, @@ -489,16 +501,9 @@ extension Processor { controller.step() case .assertBy: - let reg = payload.assertion - let assertion = registers[reg] + let payload = payload.assertion do { - guard try assertion( - &wordIndexCache, - &wordIndexMaxIndex, - input, - currentPosition, - subjectBounds - ) else { + guard try builtinAssert(by: payload) else { signalFailure() return } @@ -547,16 +552,14 @@ extension Processor { case .beginCapture: let capNum = Int( asserting: payload.capture.rawValue) + storedCaptures[capNum].startCapture(currentPosition) + controller.step() - storedCaptures[capNum].startCapture(currentPosition) - controller.step() - - case .endCapture: + case .endCapture: let capNum = Int( 
asserting: payload.capture.rawValue) - - storedCaptures[capNum].endCapture(currentPosition) - controller.step() + storedCaptures[capNum].endCapture(currentPosition) + controller.step() case .transformCapture: let (cap, trans) = payload.pairedCaptureTransform @@ -584,12 +587,6 @@ extension Processor { storedCaptures[capNum].registerValue( value, overwriteInitial: sp) controller.step() - - case .builtinAssertion: - builtinAssertion() - - case .builtinCharacterClass: - builtinCharacterClass() } } } diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index e5d33af8b..69cc3e30a 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -33,8 +33,6 @@ extension Processor { var consumeFunctions: [MEProgram.ConsumeFunction] - var assertionFunctions: [MEProgram.AssertionFunction] - // Captured-value constructors var transformFunctions: [MEProgram.TransformFunction] @@ -85,9 +83,6 @@ extension Processor.Registers { subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { consumeFunctions[i.rawValue] } - subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { - assertionFunctions[i.rawValue] - } subscript(_ i: TransformRegister) -> MEProgram.TransformFunction { transformFunctions[i.rawValue] } @@ -117,9 +112,6 @@ extension Processor.Registers { self.consumeFunctions = program.staticConsumeFunctions assert(consumeFunctions.count == info.consumeFunctions) - self.assertionFunctions = program.staticAssertionFunctions - assert(assertionFunctions.count == info.assertionFunctions) - self.transformFunctions = program.staticTransformFunctions assert(transformFunctions.count == info.transformFunctions) @@ -159,7 +151,6 @@ extension MEProgram { var strings = 0 var bitsets = 0 var consumeFunctions = 0 - var assertionFunctions = 0 var transformFunctions = 0 var matcherFunctions = 0 var ints = 0 diff --git 
a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index e56b8def2..d511c9f7c 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -122,18 +122,6 @@ extension MatchingOptions { } } -// Deprecated CharacterClass.MatchLevel API -extension MatchingOptions { - var matchLevel: _CharacterClassModel.MatchLevel { - switch semanticLevel { - case .graphemeCluster: - return .graphemeCluster - case .unicodeScalar: - return .unicodeScalar - } - } -} - // MARK: - Implementation extension MatchingOptions { /// An option that changes the behavior of a regular expression. diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index e60a1ce0e..953df6882 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -744,6 +744,41 @@ extension DSLTree.Atom.Assertion { } } +extension DSLTree.Atom.CharacterClass { + var _patternBase: String { + switch self { + case .anyGrapheme: + return ".anyGraphemeCluster" + case .anyUnicodeScalar: + return ".anyUnicodeScalar" + case .digit: + return ".digit" + case .notDigit: + return ".digit.inverted" + case .word: + return ".word" + case .notWord: + return ".word.inverted" + case .horizontalWhitespace: + return ".horizontalWhitespace" + case .notHorizontalWhitespace: + return ".horizontalWhitespace.inverted" + case .newlineSequence: + return ".newlineSequence" + case .notNewline: + return ".newlineSequence.inverted" + case .verticalWhitespace: + return ".verticalWhitespace" + case .notVerticalWhitespace: + return ".verticalWhitespace.inverted" + case .whitespace: + return ".whitespace" + case .notWhitespace: + return ".whitespace.inverted" + } + } +} + extension AST.Atom.CharacterProperty { var isUnprintableProperty: Bool { switch kind { @@ -1212,6 +1247,8 @@ extension DSLTree.Atom { case .assertion(let a): return (a._patternBase, 
false) + case .characterClass(let cc): + return (cc._patternBase, true) case .backreference(_): return ("/* TODO: backreferences */", false) @@ -1256,6 +1293,8 @@ extension DSLTree.Atom { case .assertion: return "/* TODO: assertions */" + case .characterClass: + return "/* TODO: character classes */" case .backreference: return "/* TODO: backreferences */" case .symbolicReference: diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 4eb7bc42c..f5b08dd6d 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -168,6 +168,25 @@ extension AST.Atom.EscapedBuiltin { default: return nil } } + var dslCharacterClass: DSLTree.Atom.CharacterClass? { + switch self { + case .decimalDigit: return .digit + case .notDecimalDigit: return .notDigit + case .horizontalWhitespace: return .horizontalWhitespace + case .notHorizontalWhitespace: return .notHorizontalWhitespace + case .newlineSequence: return .newlineSequence + case .notNewline: return .notNewline + case .whitespace: return .whitespace + case .notWhitespace: return .notWhitespace + case .verticalTab: return .verticalWhitespace + case .notVerticalTab: return .notVerticalWhitespace + case .wordCharacter: return .word + case .notWordCharacter: return .notWord + case .graphemeCluster: return .anyGrapheme + case .trueAnychar: return .anyUnicodeScalar + default: return nil + } + } } extension AST.Atom { @@ -179,6 +198,12 @@ extension AST.Atom { default: return nil } } + var dslCharacterClass: DSLTree.Atom.CharacterClass? 
{ + switch kind { + case .escaped(let b): return b.dslCharacterClass + default: return nil + } + } } extension AST.Atom { @@ -186,6 +211,10 @@ extension AST.Atom { if let kind = dslAssertionKind { return .assertion(kind) } + + if let cc = dslCharacterClass { + return .characterClass(cc) + } switch self.kind { case let .char(c): return .char(c) @@ -194,9 +223,11 @@ extension AST.Atom { case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) - case .escaped(let c) where c.scalarValue != nil: - return .scalar(c.scalarValue!) - + case .escaped(let c): + guard let val = c.scalarValue else { + fatalError("Got a .escaped that was not an assertion, character class, or scalar value \(self)") + } + return .scalar(val) default: return .unconverted(.init(ast: self)) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 520f4991a..0a0831706 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -177,6 +177,7 @@ extension DSLTree { /// newlines unless single line mode is enabled. 
case dot + case characterClass(CharacterClass) case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -189,9 +190,9 @@ extension DSLTree { extension DSLTree.Atom { @_spi(RegexBuilder) - public enum Assertion: Hashable { + public enum Assertion: UInt64, Hashable { /// \A - case startOfSubject + case startOfSubject = 0 /// \Z case endOfSubjectBeforeNewline @@ -231,6 +232,46 @@ extension DSLTree.Atom { /// \B case notWordBoundary } + + @_spi(RegexBuilder) + public enum CharacterClass: Hashable { + case digit + case notDigit + case horizontalWhitespace + case notHorizontalWhitespace + case newlineSequence + case notNewline + case whitespace + case notWhitespace + case verticalWhitespace + case notVerticalWhitespace + case word + case notWord + case anyGrapheme + case anyUnicodeScalar + } +} + +extension DSLTree.Atom.CharacterClass { + @_spi(RegexBuilder) + public var inverted: DSLTree.Atom.CharacterClass? { + switch self { + case .anyGrapheme: return nil + case .anyUnicodeScalar: return nil + case .digit: return .notDigit + case .notDigit: return .digit + case .word: return .notWord + case .notWord: return .word + case .horizontalWhitespace: return .notHorizontalWhitespace + case .notHorizontalWhitespace: return .horizontalWhitespace + case .newlineSequence: return .notNewline + case .notNewline: return .newlineSequence + case .verticalWhitespace: return .notVerticalWhitespace + case .notVerticalWhitespace: return .verticalWhitespace + case .whitespace: return .notWhitespace + case .notWhitespace: return .whitespace + } + } } extension Unicode.GeneralCategory { @@ -767,34 +808,8 @@ extension DSLTree { internal var ast: AST.MatchingOptionSequence } - @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom - - // FIXME: The below APIs should be removed once the DSL tree has been - // migrated to use proper DSL atoms for them. 
- - public static var _anyGrapheme: Self { - .init(ast: .init(.escaped(.graphemeCluster), .fake)) - } - public static var _whitespace: Self { - .init(ast: .init(.escaped(.whitespace), .fake)) - } - public static var _digit: Self { - .init(ast: .init(.escaped(.decimalDigit), .fake)) - } - public static var _horizontalWhitespace: Self { - .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) - } - public static var _newlineSequence: Self { - .init(ast: .init(.escaped(.newlineSequence), .fake)) - } - public static var _verticalWhitespace: Self { - .init(ast: .init(.escaped(.verticalTab), .fake)) - } - public static var _word: Self { - .init(ast: .init(.escaped(.wordCharacter), .fake)) - } } } } @@ -808,7 +823,7 @@ extension DSLTree.Atom { case .changeMatchingOptions, .assertion: return false case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, - .symbolicReference, .unconverted: + .symbolicReference, .unconverted, .characterClass: return true } } diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 94c311e82..50da079f6 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,39 @@ @_spi(_Unicode) import Swift +extension Processor { + func atSimpleBoundary( + _ usesAsciiWord: Bool, + _ semanticLevel: MatchingOptions.SemanticLevel + ) -> Bool { + func matchesWord(at i: Input.Index) -> Bool { + switch semanticLevel { + case .graphemeCluster: + let c = input[i] + return c.isWordCharacter && (c.isASCII || !usesAsciiWord) + case .unicodeScalar: + let c = input.unicodeScalars[i] + return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) + } + } + + // FIXME: How should we handle bounds? 
+ // We probably need two concepts + if subjectBounds.isEmpty { return false } + if currentPosition == subjectBounds.lowerBound { + return matchesWord(at: currentPosition) + } + let priorIdx = input.index(before: currentPosition) + if currentPosition == subjectBounds.upperBound { + return matchesWord(at: priorIdx) + } + + let prior = matchesWord(at: priorIdx) + let current = matchesWord(at: currentPosition) + return prior != current + } +} + extension String { func isOnWordBoundary( at i: String.Index, diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 31245c0f7..e0df906fa 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -58,6 +58,14 @@ public struct _RegexFactory { ) -> Regex { .init(node: .atom(.scalar(scalar))) } + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func characterClass( + _ cc: DSLTree.Atom.CharacterClass + ) -> Regex { + .init(node: .atom(.characterClass(cc))) + } @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) diff --git a/Sources/_StringProcessing/Utility/TypedInt.swift b/Sources/_StringProcessing/Utility/TypedInt.swift index adc9edf78..e03f2572f 100644 --- a/Sources/_StringProcessing/Utility/TypedInt.swift +++ b/Sources/_StringProcessing/Utility/TypedInt.swift @@ -142,10 +142,6 @@ enum _AsciiBitsetRegister {} typealias ConsumeFunctionRegister = TypedInt<_ConsumeFunctionRegister> enum _ConsumeFunctionRegister {} -/// Used for assertion functions, e.g. 
anchors etc -typealias AssertionFunctionRegister = TypedInt<_AssertionFunctionRegister> -enum _AssertionFunctionRegister {} - /// Used for capture transforms, etc typealias TransformRegister = TypedInt<_TransformRegister> enum _TransformRegister {} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 9f515f220..3be26f27f 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -17,27 +17,38 @@ struct _CharacterClassModel: Hashable { /// The actual character class to match. - var cc: Representation + let cc: Representation /// The level (character or Unicode scalar) at which to match. - var matchLevel: MatchLevel + let matchLevel: MatchingOptions.SemanticLevel + + /// Whether this character class matches only ASCII characters + let isStrictASCII: Bool /// Whether this character class matches against an inverse, /// e.g \D, \S, [^abc]. - var isInverted: Bool = false + let isInverted: Bool + + init( + cc: Representation, + options: MatchingOptions, + isInverted: Bool + ) { + self.cc = cc + self.matchLevel = options.semanticLevel + self.isStrictASCII = cc.isStrictAscii(options: options) + self.isInverted = isInverted + } - // TODO: Split out builtin character classes into their own type? - enum Representation: Hashable { + enum Representation: UInt64, Hashable { /// Any character - case any + case any = 0 /// Any grapheme cluster case anyGrapheme /// Any Unicode scalar case anyScalar /// Character.isDigit case digit - /// Character.isHexDigit - case hexDigit /// Horizontal whitespace: `[:blank:]`, i.e /// `[\p{gc=Space_Separator}\N{CHARACTER TABULATION}] case horizontalWhitespace @@ -50,43 +61,6 @@ struct _CharacterClassModel: Hashable { /// Character.isLetter or Character.isDigit or Character == "_" case word } - - enum MatchLevel: Hashable { - /// Match at the extended grapheme cluster level.
- case graphemeCluster - /// Match at the Unicode scalar level. - case unicodeScalar - } - - var scalarSemantic: Self { - var result = self - result.matchLevel = .unicodeScalar - return result - } - - var graphemeClusterSemantic: Self { - var result = self - result.matchLevel = .graphemeCluster - return result - } - - /// Conditionally inverts a character class. - /// - /// - Parameter inversion: Indicates whether to invert the character class. - /// - Returns: The inverted character class if `inversion` is `true`; - /// otherwise, the same character class. - func withInversion(_ inversion: Bool) -> Self { - var copy = self - if inversion { - copy.isInverted.toggle() - } - return copy - } - - /// Inverts a character class. - var inverted: Self { - return withInversion(true) - } /// Returns the end of the match of this character class in the string. /// @@ -94,111 +68,106 @@ struct _CharacterClassModel: Hashable { /// - Parameter at: The index to start matching. /// - Parameter options: Options for the match operation. /// - Returns: The index of the end of the match, or `nil` if there is no match. - func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? 
{ - switch matchLevel { - case .graphemeCluster: - let c = str[i] - var matched: Bool - var next = str.index(after: i) - switch cc { - case .any, .anyGrapheme: matched = true - case .anyScalar: - matched = true - next = str.unicodeScalars.index(after: i) - case .digit: - matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) - case .hexDigit: - matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: - matched = c.unicodeScalars.first?.isHorizontalWhitespace == true - && (c.isASCII || !options.usesASCIISpaces) - case .newlineSequence, .verticalWhitespace: - matched = c.unicodeScalars.first?.isNewline == true - && (c.isASCII || !options.usesASCIISpaces) - case .whitespace: - matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) - case .word: - matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) + func matches( + in input: String, + at currentPosition: String.Index + ) -> String.Index? { + // FIXME: This is only called in custom character classes that contain builtin + // character classes as members (ie: [a\w] or set operations), is there + // any way to avoid that? Can we remove this somehow? + guard currentPosition != input.endIndex else { + return nil + } + let char = input[currentPosition] + let scalar = input.unicodeScalars[currentPosition] + let isScalarSemantics = matchLevel == .unicodeScalar + let asciiCheck = (char.isASCII && !isScalarSemantics) + || (scalar.isASCII && isScalarSemantics) + || !isStrictASCII + + var matched: Bool + var next: String.Index + switch (isScalarSemantics, cc) { + case (_, .anyGrapheme): + next = input.index(after: currentPosition) + case (_, .anyScalar): + // FIXME: This allows us to be not-scalar aligned when in grapheme mode + // Should this even be allowed? 
+ next = input.unicodeScalars.index(after: currentPosition) + case (true, _): + next = input.unicodeScalars.index(after: currentPosition) + case (false, _): + next = input.index(after: currentPosition) + } + + switch cc { + case .any, .anyGrapheme, .anyScalar: + matched = true + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck } - if isInverted { - matched.toggle() + case .verticalWhitespace: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck } - return matched ? next : nil - case .unicodeScalar: - let c = str.unicodeScalars[i] - var nextIndex = str.unicodeScalars.index(after: i) - var matched: Bool - switch cc { - case .any: matched = true - case .anyScalar: matched = true - case .anyGrapheme: - matched = true - nextIndex = str.index(after: i) - case .digit: - matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) - case .hexDigit: - matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: - matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) - case .verticalWhitespace: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { - str.unicodeScalars.formIndex(after: &nextIndex) + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && next != input.endIndex && input.unicodeScalars[next] == "\n" { + // Match a full CR-LF sequence even in scalar semantics +
input.unicodeScalars.formIndex(after: &next) } - case .whitespace: - matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) - case .word: - matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck } - if isInverted { - matched.toggle() + case .word: + if isScalarSemantics { + matched = (scalar.properties.isAlphabetic || scalar == "_") && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck } - return matched ? nextIndex : nil + } + if isInverted { + matched.toggle() + } + if matched { + return next + } else { + return nil } } } -extension _CharacterClassModel { - static var any: _CharacterClassModel { - .init(cc: .any, matchLevel: .graphemeCluster) - } - - static var anyGrapheme: _CharacterClassModel { - .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) - } - - static var anyUnicodeScalar: _CharacterClassModel { - .init(cc: .any, matchLevel: .unicodeScalar) - } - - static var whitespace: _CharacterClassModel { - .init(cc: .whitespace, matchLevel: .graphemeCluster) - } - - static var digit: _CharacterClassModel { - .init(cc: .digit, matchLevel: .graphemeCluster) - } - - static var hexDigit: _CharacterClassModel { - .init(cc: .hexDigit, matchLevel: .graphemeCluster) - } - - static var horizontalWhitespace: _CharacterClassModel { - .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) - } - - static var newlineSequence: _CharacterClassModel { - .init(cc: .newlineSequence, matchLevel: .graphemeCluster) - } - - static var verticalWhitespace: _CharacterClassModel { - .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) - } - - static var word: _CharacterClassModel { - .init(cc: .word, matchLevel: .graphemeCluster) +extension _CharacterClassModel.Representation { + /// Returns true if
this CharacterClass should be matched by strict ascii under the given options + func isStrictAscii(options: MatchingOptions) -> Bool { + switch self { + case .digit: return options.usesASCIIDigits + case .horizontalWhitespace: return options.usesASCIISpaces + case .newlineSequence: return options.usesASCIISpaces + case .verticalWhitespace: return options.usesASCIISpaces + case .whitespace: return options.usesASCIISpaces + case .word: return options.usesASCIIWord + default: return false + } } } @@ -209,7 +178,6 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .anyGrapheme: return "" case .anyScalar: return "" case .digit: return "" - case .hexDigit: return "" case .horizontalWhitespace: return "" case .newlineSequence: return "" case .verticalWhitespace: return "vertical whitespace" @@ -225,102 +193,57 @@ extension _CharacterClassModel: CustomStringConvertible { } } -extension _CharacterClassModel { - func withMatchLevel( - _ level: _CharacterClassModel.MatchLevel - ) -> _CharacterClassModel { - var cc = self - cc.matchLevel = level - return cc - } -} - -extension AST.Atom { - var characterClass: _CharacterClassModel? { - switch kind { - case let .escaped(b): return b.characterClass - - case .property: - // TODO: Would our model type for character classes include - // this? Or does grapheme-semantic mode complicate that? - return nil - - case .dot: - // `.dot` is handled in the matching engine by Compiler.emitDot() and in - // the legacy compiler by the `.any` instruction, which can provide lower - // level instructions than the CharacterClass-generated consumer closure - // - // FIXME: We shouldn't be returning `nil` here, but instead fixing the call - // site to check for any before trying to construct a character class. - return nil - - default: return nil - - } - } - -} - -extension AST.Atom.EscapedBuiltin { - var characterClass: _CharacterClassModel? 
{ +extension DSLTree.Atom.CharacterClass { + /// Converts this DSLTree CharacterClass into our runtime representation + func asRuntimeModel(_ options: MatchingOptions) -> _CharacterClassModel { + let cc: _CharacterClassModel.Representation + var inverted = false switch self { - case .decimalDigit: return .digit - case .notDecimalDigit: return .digit.inverted - - case .horizontalWhitespace: return .horizontalWhitespace + case .digit: + cc = .digit + case .notDigit: + cc = .digit + inverted = true + + case .horizontalWhitespace: + cc = .horizontalWhitespace case .notHorizontalWhitespace: - return .horizontalWhitespace.inverted + cc = .horizontalWhitespace + inverted = true - case .newlineSequence: return .newlineSequence + case .newlineSequence: + cc = .newlineSequence // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through // emitDot(). For now we treat it as semantically invalid. - case .notNewline: return .newlineSequence.inverted - - case .whitespace: return .whitespace - case .notWhitespace: return .whitespace.inverted - - case .verticalTab: return .verticalWhitespace - case .notVerticalTab: return .verticalWhitespace.inverted - - case .wordCharacter: return .word - case .notWordCharacter: return .word.inverted - - case .graphemeCluster: return .anyGrapheme - case .trueAnychar: return .anyUnicodeScalar - - default: - return nil - } - } -} - -extension _CharacterClassModel { - // FIXME: Calling on inverted sets wont be the same as the - // inverse of a boundary if at the start or end of the - // string. (Think through what we want: do it ourselves or - // give the caller both options). - func isBoundary( - _ input: String, - at pos: String.Index, - bounds: Range, - with options: MatchingOptions - ) -> Bool { - // FIXME: How should we handle bounds? 
- // We probably need two concepts - if bounds.isEmpty { return false } - if pos == bounds.lowerBound { - return self.matches(in: input, at: pos, with: options) != nil - } - let priorIdx = input.index(before: pos) - if pos == bounds.upperBound { - return self.matches(in: input, at: priorIdx, with: options) != nil + case .notNewline: + cc = .newlineSequence + inverted = true + + case .whitespace: + cc = .whitespace + case .notWhitespace: + cc = .whitespace + inverted = true + + case .verticalWhitespace: + cc = .verticalWhitespace + case .notVerticalWhitespace: + cc = .verticalWhitespace + inverted = true + + case .word: + cc = .word + case .notWord: + cc = .word + inverted = true + + case .anyGrapheme: + cc = .anyGrapheme + case .anyUnicodeScalar: + cc = .anyScalar } - - let prior = self.matches(in: input, at: priorIdx, with: options) != nil - let current = self.matches(in: input, at: pos, with: options) != nil - return prior != current + return _CharacterClassModel(cc: cc, options: options, isInverted: inverted) } - } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 27f8d79cb..e0702f87f 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -38,6 +38,7 @@ enum DecodedInstr { case matchScalarUnchecked case matchBitsetScalar case matchBitset + case matchBuiltin case consumeBy case assertBy case matchBy @@ -46,8 +47,6 @@ enum DecodedInstr { case endCapture case transformCapture case captureValue - case builtinAssertion - case builtinCharacterClass } extension DecodedInstr { @@ -56,87 +55,84 @@ extension DecodedInstr { /// /// Must stay in sync with Processor.cycle static func decode(_ instruction: Instruction) -> DecodedInstr { - let (opcode, payload) = instruction.destructure - - switch opcode { - case .invalid: - fatalError("Invalid program") - case .moveImmediate: - return .moveImmediate - case .moveCurrentPosition: - return .moveCurrentPosition - case .branch: - return .branch - 
case .condBranchZeroElseDecrement: - return .condBranchZeroElseDecrement - case .condBranchSamePosition: - return .condBranchSamePosition - case .save: - return .save - case .saveAddress: - return .saveAddress - case .splitSaving: - return .splitSaving - case .clear: - return .clear - case .clearThrough: - return .clearThrough - case .accept: - return .accept - case .fail: - return .fail - case .advance: - return .advance - case .match: - let (isCaseInsensitive, _) = payload.elementPayload - if isCaseInsensitive { - return .matchCaseInsensitive - } else { - return .match - } - case .matchScalar: - let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if boundaryCheck { - return .matchScalarCaseInsensitive - } else { - return .matchScalarCaseInsensitiveUnchecked - } + let (opcode, payload) = instruction.destructure + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: + return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return .condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive } else { - if boundaryCheck { - return .matchScalar - } else { - return .matchScalarUnchecked - } + return .matchScalarCaseInsensitiveUnchecked } - case .matchBitset: - let 
(isScalar, _) = payload.bitsetPayload - if isScalar { - return .matchBitsetScalar + } else { + if boundaryCheck { + return .matchScalar } else { - return .matchBitset + return .matchScalarUnchecked } - case .consumeBy: - return consumeBy - case .assertBy: - return .assertBy - case .matchBy: - return .matchBy - case .backreference: - return .backreference - case .beginCapture: - return .beginCapture - case .endCapture: - return .endCapture - case .transformCapture: - return .transformCapture - case .captureValue: - return .captureValue - case .builtinAssertion: - return .builtinAssertion - case .builtinCharacterClass: - return .builtinCharacterClass -} + } + case .matchBitset: + let (isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .matchBuiltin: + return .matchBuiltin + } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 5f4c8bb30..ff51088ff 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1517,13 +1517,10 @@ extension RegexTests { (" 123", "23"), ("123 456", "23")) - // TODO: \G and \K - do { - let regex = try Regex(#"\Gab"#, as: Substring.self) - XCTExpectFailure { - XCTAssertEqual("abab".matches(of: regex).map(\.output), ["ab", "ab"]) - } - } + // \G and \K + let regex = try Regex(#"\Gab"#, as: Substring.self) + XCTAssertEqual("abab".matches(of: regex).map(\.output), ["ab", "ab"]) + // TODO: Oniguruma \y and \Y firstMatchTests(