From a79f1cb0c3061c6650825583b0ec2525d6bb3eed Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 26 Mar 2024 21:09:23 -0500 Subject: [PATCH] Remove most consumer functions (#660) This is based heavily off the work in #590, rebased onto main, with some changes to remove even more consumer uses. Consumer functions only have two remaining uses: non-ASCII ranges and Unicode lookups (for things like general category, binary properties, name, etc.). This change primarily treats custom character classes as alternations around their contents, with set operations emitted as instructions instead of implemented via consumer function. --- Sources/_StringProcessing/ByteCodeGen.swift | 296 +++++++++++++++++- .../_StringProcessing/ConsumerInterface.swift | 270 +--------------- .../Engine/Instruction.swift | 8 + .../_StringProcessing/Engine/MEBuilder.swift | 8 + .../_StringProcessing/Engine/Processor.swift | 27 +- Sources/_StringProcessing/Regex/DSLTree.swift | 28 ++ Tests/RegexTests/CompileTests.swift | 45 +++ 7 files changed, 399 insertions(+), 283 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 931e62bbd..3cf6bd6f2 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitAlternation( - _ children: [DSLTree.Node] - ) throws { + mutating func emitAlternationGen( + _ elements: C, + withBacktracking: Bool, + _ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void + ) rethrows { // Alternation: p0 | p1 | ... | pn // save next_p1 // @@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen { // // done: let done = builder.makeAddress() - for component in children.dropLast() { + for element in elements.dropLast() { let next = builder.makeAddress() builder.buildSave(next) - try emitNode(component) + try body(&self, element) + if !withBacktracking { + builder.buildClear() + } builder.buildBranch(to: done) builder.label(next) } - try emitNode(children.last!) + try body(&self, elements.last!) builder.label(done) } + + mutating func emitAlternation( + _ children: [DSLTree.Node] + ) throws { + try emitAlternationGen(children, withBacktracking: true) { + try $0.emitNode($1) + } + } mutating func emitConcatenationComponent( _ node: DSLTree.Node @@ -846,6 +859,36 @@ fileprivate extension Compiler.ByteCodeGen { } } + /// Flatten quoted strings into sequences of atoms, so that the standard + /// CCC codegen will handle them. + func flatteningCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + var characters: Set = [] + var scalars: Set = [] + var result: [DSLTree.CustomCharacterClass.Member] = [] + for member in members { + switch member { + case .atom(let atom): + switch atom { + case let .char(char): + characters.insert(char) + case let .scalar(scalar): + scalars.insert(scalar) + default: + result.append(member) + } + case let .quotedLiteral(str): + characters.formUnion(str) + default: + result.append(member) + } + } + result.append(contentsOf: characters.map { .atom(.char($0)) }) + result.append(contentsOf: scalars.map { .atom(.scalar($0)) }) + return result + } + func coalescingCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) -> DSLTree.CustomCharacterClass { @@ -853,12 +896,150 @@ fileprivate extension Compiler.ByteCodeGen { // mode, we don't want to coalesce any scalars into a grapheme. This // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and // U+302. - guard options.semanticLevel == .graphemeCluster else { return ccc } - - let members = coalescingCustomCharacterClassMembers(ccc.members) - return .init(members: members, isInverted: ccc.isInverted) + let members = options.semanticLevel == .graphemeCluster + ? coalescingCustomCharacterClassMembers(ccc.members) + : ccc.members + return .init( + members: flatteningCustomCharacterClassMembers(members), + isInverted: ccc.isInverted) } + mutating func emitCharacterInCCC(_ c: Character) { + switch options.semanticLevel { + case .graphemeCluster: + emitCharacter(c) + case .unicodeScalar: + // When in scalar mode, act like an alternation of the individual scalars + // that comprise a character. + emitAlternationGen(c.unicodeScalars, withBacktracking: false) { + $0.emitMatchScalar($1) + } + } + } + + mutating func emitCCCMember( + _ member: DSLTree.CustomCharacterClass.Member + ) throws { + switch member { + case .atom(let atom): + switch atom { + case .char(let c): + emitCharacterInCCC(c) + case .scalar(let s): + emitCharacterInCCC(Character(s)) + default: + try emitAtom(atom) + } + case .custom(let ccc): + try emitCustomCharacterClass(ccc) + case .quotedLiteral: + fatalError("Removed in 'flatteningCustomCharacterClassMembers'") + case .range: + let consumer = try member.generateConsumer(options) + builder.buildConsume(by: consumer) + case .trivia: + return + + // TODO: Can we decide when it's better to try `rhs` first? + // Intersection is trivial, since failure on either side propagates: + // - store current position + // - lhs + // - restore current position + // - rhs + case let .intersection(lhs, rhs): + let r = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) + + // TODO: Can we decide when it's better to try `rhs` first? + // For subtraction, failure in `lhs` propagates, while failure in `rhs` is + // swallowed/reversed: + // - store current position + // - lhs + // - save to end + // - restore current position + // - rhs + // - clear, fail (since both succeeded) + // - end: ... + case let .subtraction(lhs, rhs): + let r = builder.makePositionRegister() + let end = builder.makeAddress() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) // no match here = failure, propagates + builder.buildSave(end) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) // no match here = success, resumes at 'end' + builder.buildClear() // clears 'end' + builder.buildFail() // this failure propagates outward + builder.label(end) + + // Symmetric difference always requires executing both `rhs` and `lhs`. + // Execute each, ignoring failure and storing the resulting position in a + // register. If those results are equal, fail. If they're different, use + // the position that is different from the starting position: + // - store current position as r0 + // - save to lhsFail + // - lhs + // - clear lhsFail (and continue) + // - lhsFail: save position as r1 + // + // - restore current position + // - save to rhsFail + // - rhs + // - clear rhsFail (and continue) + // - rhsFail: save position as r2 + // + // - restore to resulting position from lhs (r1) + // - if equal to r2, goto fail (both sides had same result) + // - if equal to r0, goto advance (lhs failed) + // - goto end + // - advance: restore to resulting position from rhs (r2) + // - goto end + // - fail: fail + // - end: ... + case let .symmetricDifference(lhs, rhs): + let r0 = builder.makePositionRegister() + let r1 = builder.makePositionRegister() + let r2 = builder.makePositionRegister() + let lhsFail = builder.makeAddress() + let rhsFail = builder.makeAddress() + let advance = builder.makeAddress() + let fail = builder.makeAddress() + let end = builder.makeAddress() + + builder.buildMoveCurrentPosition(into: r0) + builder.buildSave(lhsFail) + try emitCustomCharacterClass(lhs) + builder.buildClear() + builder.label(lhsFail) + builder.buildMoveCurrentPosition(into: r1) + + builder.buildRestorePosition(from: r0) + builder.buildSave(rhsFail) + try emitCustomCharacterClass(rhs) + builder.buildClear() + builder.label(rhsFail) + builder.buildMoveCurrentPosition(into: r2) + + // If r1 == r2, then fail + builder.buildRestorePosition(from: r1) + builder.buildCondBranch(to: fail, ifSamePositionAs: r2) + + // If r1 == r0, then move to r2 before ending + builder.buildCondBranch(to: advance, ifSamePositionAs: r0) + builder.buildBranch(to: end) + builder.label(advance) + builder.buildRestorePosition(from: r2) + builder.buildBranch(to: end) + + builder.label(fail) + builder.buildFail() + builder.label(end) + } + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { @@ -876,8 +1057,67 @@ fileprivate extension Compiler.ByteCodeGen { } return } - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + + let updatedCCC: DSLTree.CustomCharacterClass + if optimizationsEnabled { + updatedCCC = ccc.coalescingASCIIMembers(options) + } else { + updatedCCC = ccc + } + let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia}) + + if updatedCCC.isInverted { + // inverted + // custom character class: p0 | p1 | ... | pn + // Try each member to make sure they all fail + // save next_p1 + // + // clear, fail + // next_p1: + // save next_p2 + // + // clear fail + // next_p2: + // save next_p... + // + // clear fail + // ... + // next_pn: + // save done + // + // clear fail + // done: + // step forward by 1 + let done = builder.makeAddress() + for member in filteredMembers.dropLast() { + let next = builder.makeAddress() + builder.buildSave(next) + try emitCCCMember(member) + builder.buildClear() + builder.buildFail() + builder.label(next) + } + builder.buildSave(done) + try emitCCCMember(filteredMembers.last!) + builder.buildClear() + builder.buildFail() + builder.label(done) + + // Consume a single unit for the inverted ccc + switch options.semanticLevel { + case .graphemeCluster: + builder.buildAdvance(1) + case .unicodeScalar: + builder.buildAdvanceUnicodeScalar(1) + } + return + } + // non inverted CCC + // Custom character class: p0 | p1 | ... | pn + // Very similar to alternation, but we don't keep backtracking save points + try emitAlternationGen(filteredMembers, withBacktracking: false) { + try $0.emitCCCMember($1) + } } mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { @@ -1014,6 +1254,12 @@ fileprivate extension Compiler.ByteCodeGen { } extension DSLTree.Node { + /// A Boolean value indicating whether this node advances the match position + /// on a successful match. + /// + /// For example, an alternation like `(a|b|c)` always advances the position + /// by a character, but `(a|b|)` has an empty branch, which matches without + /// advancing. var guaranteesForwardProgress: Bool { switch self { case .orderedChoice(let children): @@ -1044,8 +1290,8 @@ extension DSLTree.Node { case .consumer, .matcher: // Allow zero width consumers and matchers return false - case .customCharacterClass: - return true + case .customCharacterClass(let ccc): + return ccc.guaranteesForwardProgress case .quantification(let amount, _, let child): let (atLeast, _) = amount.ast.bounds return atLeast ?? 0 > 0 && child.guaranteesForwardProgress @@ -1053,3 +1299,25 @@ extension DSLTree.Node { } } } + +extension DSLTree.CustomCharacterClass { + /// We allow trivia into CustomCharacterClass, which could result in a CCC + /// that matches nothing, ie `(?x)[ ]`. + var guaranteesForwardProgress: Bool { + for m in members { + switch m { + case .trivia: + continue + case let .intersection(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + case let .subtraction(lhs, _): + return lhs.guaranteesForwardProgress + case let .symmetricDifference(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + default: + return true + } + } + return false + } +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 99d1b3cae..898985a4d 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -18,89 +18,12 @@ extension Character { } } -extension DSLTree.Node { - /// Attempt to generate a consumer from this AST node - /// - /// A consumer is a Swift closure that matches against - /// the front of an input range - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case .atom(let a): - return try a.generateConsumer(opts) - case .customCharacterClass(let ccc): - return try ccc.generateConsumer(opts) - - case .quotedLiteral: - // TODO: Should we handle this here? - return nil - - case let .convertedRegexLiteral(n, _): - return try n.generateConsumer(opts) - - case .orderedChoice, .conditional, .concatenation, - .capture, .nonCapturingGroup, - .quantification, .trivia, .empty, - .ignoreCapturesInTypedOutput, .absentFunction: return nil - - case .consumer: - fatalError("FIXME: Is this where we handle them?") - case .matcher: - fatalError("FIXME: Is this where we handle them?") - case .characterPredicate: - fatalError("FIXME: Is this where we handle them?") - } - } -} - extension DSLTree._AST.Atom { var singleScalarASCIIValue: UInt8? { return ast.singleScalarASCIIValue } } -extension Character { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - let isCaseInsensitive = opts.isCaseInsensitive - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - guard let (char, next) = input.characterAndEnd( - at: bounds.lowerBound, limitedBy: bounds.upperBound) - else { return nil } - if isCaseInsensitive && isCased { - return char.lowercased() == self.lowercased() - ? next - : nil - } else { - return char == self - ? next - : nil - } - } - case .unicodeScalar: - // TODO: This should only be reachable from character class emission, can - // we guarantee that? Otherwise we'd want a different matching behavior. - let consumers = unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } - } -} - extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { @@ -114,85 +37,6 @@ extension DSLTree.Atom { return nil } } - - // TODO: If ByteCodeGen switches first, then this is unnecessary for - // top-level nodes, but it's also invoked for `.atom` members of a custom CC - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case let .char(c): - return try c.generateConsumer(opts) - - case let .scalar(s): - // A scalar always matches the same as a single scalar character. This - // means it must match a whole grapheme in grapheme semantic mode, but - // can match a single scalar in scalar semantic mode. - return try Character(s).generateConsumer(opts) - - case .any: - // FIXME: Should this be a total ordering? - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - input.index(after: bounds.lowerBound) - } - } else { - return consumeScalar { _ in - true - } - } - - case .anyNonNewline: - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) - } - case .unicodeScalar: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.unicodeScalars.index(after: bounds.lowerBound) - } - } - - case .dot: - throw Unreachable(".atom(.dot) should be handled by emitDot") - - case .assertion: - // TODO: We could handle, should this be total? - return nil - case .characterClass(let cc): - return cc.generateConsumer(opts) - - case .backreference: - // TODO: Should we handle? - return nil - - case .symbolicReference: - // TODO: Should we handle? - return nil - - case .changeMatchingOptions: - // TODO: Should we handle? - return nil - - case let .unconverted(a): - return try a.ast.generateConsumer(opts) - } - - } -} - -extension DSLTree.Atom.CharacterClass { - func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { - let model = asRuntimeModel(opts) - return { input, bounds in - model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound) - } - } } extension String { @@ -283,47 +127,20 @@ extension AST.Atom { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { switch kind { - case let .scalar(s): - assertionFailure( - "Should have been handled by tree conversion") - return consumeScalar { $0 == s.value } - - case let .char(c): - assertionFailure( - "Should have been handled by tree conversion") - - // TODO: Match level? - return { input, bounds in - let low = bounds.lowerBound - guard input[low] == c else { - return nil - } - return input.index(after: low) - } - case let .property(p): return try p.generateConsumer(opts) case let .namedCharacter(name): return consumeName(name, opts: opts) - case .dot: - assertionFailure( - "Should have been handled by tree conversion") - fatalError(".atom(.dot) is handled in emitDot") - - case .caretAnchor, .dollarAnchor: - // handled in emitAssertion - return nil - case .escaped: - // handled in emitAssertion and emitCharacterClass - return nil - case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement return nil + + case .char, .scalar, .escaped, .dot, .caretAnchor, .dollarAnchor: + fatalError("Handled in ByteCodeGen or earlier.") #if RESILIENT_LIBRARIES @unknown default: @@ -367,11 +184,6 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { switch self { - case let .atom(a): - guard let c = try a.generateConsumer(opts) else { - throw Unsupported("Consumer for \(a)") - } - return c case let .range(low, high): guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") @@ -437,57 +249,12 @@ extension DSLTree.CustomCharacterClass.Member { return nil } - case let .custom(ccc): - return try ccc.generateConsumer(opts) - - case let .intersection(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - let rhsIdx = rhs(input, bounds) - { - guard lhsIdx == rhsIdx else { - fatalError("TODO: What should we do here?") - } - return lhsIdx - } - return nil - } + case .atom, .custom, .intersection, .subtraction, .symmetricDifference: + fatalError("Handled in 'emitCustomCharacterClass'") - case let .subtraction(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - rhs(input, bounds) == nil - { - return lhsIdx - } - return nil - } + case .quotedLiteral: + fatalError("Removed in 'flatteningCustomCharacterClassMembers'") - case let .symmetricDifference(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds) { - return rhs(input, bounds) == nil ? lhsIdx : nil - } - return rhs(input, bounds) - } - case .quotedLiteral(let str): - let consumers = try str.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } case .trivia: // TODO: Should probably strip this earlier... return { _, _ in nil } @@ -508,28 +275,6 @@ extension DSLTree.CustomCharacterClass { } ) } - - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - // NOTE: Easy way to implement, obviously not performant - let consumers = try members.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return isInverted ? nil : idx - } - } - if isInverted { - return opts.semanticLevel == .graphemeCluster - ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound) - : input.unicodeScalars.index(after: bounds.lowerBound) - } - return nil - } - } } // NOTE: Conveniences, though not most performant @@ -638,7 +383,6 @@ extension AST.Atom.CharacterProperty { case .generalCategory(let p): return try p.generateConsumer(opts) -// fatalError("TODO: Map categories: \(p)") case .binary(let prop, value: let value): let cons = try prop.generateConsumer(opts) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 21ab90a03..011174660 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -44,6 +44,14 @@ extension Instruction { /// Operands: /// - Position register to move into case moveCurrentPosition + + /// Set the current position to the value stored in the register + /// + /// restorePosition(from: PositionRegister) + /// + /// Operands: + /// - Position register to read from + case restorePosition // MARK: General Purpose: Control flow diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 44c938e71..1f5611f98 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -84,6 +84,10 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } + mutating func buildFatalError() { + instructions.append(.init(.invalid)) + } + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { @@ -315,6 +319,10 @@ extension MEProgram.Builder { instructions.append(.init(.moveCurrentPosition, .init(position: r))) } + mutating func buildRestorePosition(from r: PositionRegister) { + instructions.append(.init(.restorePosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister, isScalarMode: Bool diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 86365322b..0de7c2f70 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -175,14 +175,25 @@ extension Processor { // save point was restored mutating func consume(_ n: Distance) -> Bool { // TODO: needs benchmark coverage - guard let idx = input.index( + if let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end - ) else { - signalFailure() - return false + ) { + currentPosition = idx + return true } - currentPosition = idx - return true + + // If `end` falls in the middle of a character, and we are trying to advance + // by one "character", then we should max out at `end` even though the above + // advancement will result in `nil`. + if n == 1, let idx = input.unicodeScalars.index( + currentPosition, offsetBy: n.rawValue, limitedBy: end + ) { + currentPosition = idx + return true + } + + signalFailure() + return false } // Advances in unicode scalar view @@ -413,6 +424,10 @@ extension Processor { let reg = payload.position registers[reg] = currentPosition controller.step() + case .restorePosition: + let reg = payload.position + currentPosition = registers[reg] + controller.step() case .branch: controller.pc = payload.addr diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 9f0e2df23..2cc4d032f 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -131,6 +131,23 @@ extension DSLTree { } } + func coalescingASCIIMembers(_ opts: MatchingOptions) -> CustomCharacterClass { + var ascii: [Member] = [] + var nonAscii: [Member] = [] + for member in members { + if member.asAsciiBitset(opts, false) != nil { + ascii.append(member) + } else { + nonAscii.append(member) + } + } + if ascii.isEmpty || nonAscii.isEmpty { return self } + return CustomCharacterClass(members: [ + .custom(CustomCharacterClass(members: ascii)), + .custom(CustomCharacterClass(members: nonAscii)) + ], isInverted: isInverted) + } + public init(members: [DSLTree.CustomCharacterClass.Member], isInverted: Bool = false) { self.members = members self.isInverted = isInverted @@ -161,6 +178,17 @@ extension DSLTree { indirect case intersection(CustomCharacterClass, CustomCharacterClass) indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) + + var isOnlyTrivia: Bool { + switch self { + case .custom(let ccc): + return ccc.members.all(\.isOnlyTrivia) + case .trivia: + return true + default: + return false + } + } } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index aafe752bc..d0500847b 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -19,6 +19,7 @@ enum DecodedInstr { case invalid case moveImmediate case moveCurrentPosition + case restorePosition case branch case condBranchZeroElseDecrement case condBranchSamePosition @@ -65,6 +66,8 @@ extension DecodedInstr { return .moveImmediate case .moveCurrentPosition: return .moveCurrentPosition + case .restorePosition: + return .restorePosition case .branch: return .branch case .condBranchZeroElseDecrement: @@ -368,6 +371,48 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: "[a-c]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[a-c0123]", + contains: [.matchBitset], + doesNotContain: [.matchBitsetScalar, .consumeBy]) + expectProgram( + for: #"\w"#, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"[\w]"#, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"\w"#, + semanticLevel: .unicodeScalar, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"[\w]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"\p{Greek}"#, + contains: [.consumeBy], + doesNotContain: [.matchBuiltin, .matchBitset, .matchBitsetScalar]) + + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + + expectProgram( + for: "[a-á]", + contains: [.consumeBy], + doesNotContain: [.matchBitset, .matchBitsetScalar]) + expectProgram( + for: "[a-fá-ém-zk]", + contains: [.matchBitset, .consumeBy], + doesNotContain: [.matchBitsetScalar]) } func testScalarOptimizeCompilation() {