diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 931e62bbd..3cf6bd6f2 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitAlternation( - _ children: [DSLTree.Node] - ) throws { + mutating func emitAlternationGen( + _ elements: C, + withBacktracking: Bool, + _ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void + ) rethrows { // Alternation: p0 | p1 | ... | pn // save next_p1 // @@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen { // // done: let done = builder.makeAddress() - for component in children.dropLast() { + for element in elements.dropLast() { let next = builder.makeAddress() builder.buildSave(next) - try emitNode(component) + try body(&self, element) + if !withBacktracking { + builder.buildClear() + } builder.buildBranch(to: done) builder.label(next) } - try emitNode(children.last!) + try body(&self, elements.last!) builder.label(done) } + + mutating func emitAlternation( + _ children: [DSLTree.Node] + ) throws { + try emitAlternationGen(children, withBacktracking: true) { + try $0.emitNode($1) + } + } mutating func emitConcatenationComponent( _ node: DSLTree.Node @@ -846,6 +859,36 @@ fileprivate extension Compiler.ByteCodeGen { } } + /// Flatten quoted strings into sequences of atoms, so that the standard + /// CCC codegen will handle them. + func flatteningCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + var characters: Set = [] + var scalars: Set = [] + var result: [DSLTree.CustomCharacterClass.Member] = [] + for member in members { + switch member { + case .atom(let atom): + switch atom { + case let .char(char): + characters.insert(char) + case let .scalar(scalar): + scalars.insert(scalar) + default: + result.append(member) + } + case let .quotedLiteral(str): + characters.formUnion(str) + default: + result.append(member) + } + } + result.append(contentsOf: characters.map { .atom(.char($0)) }) + result.append(contentsOf: scalars.map { .atom(.scalar($0)) }) + return result + } + func coalescingCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) -> DSLTree.CustomCharacterClass { @@ -853,12 +896,150 @@ fileprivate extension Compiler.ByteCodeGen { // mode, we don't want to coalesce any scalars into a grapheme. This // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and // U+302. - guard options.semanticLevel == .graphemeCluster else { return ccc } - - let members = coalescingCustomCharacterClassMembers(ccc.members) - return .init(members: members, isInverted: ccc.isInverted) + let members = options.semanticLevel == .graphemeCluster + ? coalescingCustomCharacterClassMembers(ccc.members) + : ccc.members + return .init( + members: flatteningCustomCharacterClassMembers(members), + isInverted: ccc.isInverted) } + mutating func emitCharacterInCCC(_ c: Character) { + switch options.semanticLevel { + case .graphemeCluster: + emitCharacter(c) + case .unicodeScalar: + // When in scalar mode, act like an alternation of the individual scalars + // that comprise a character. + emitAlternationGen(c.unicodeScalars, withBacktracking: false) { + $0.emitMatchScalar($1) + } + } + } + + mutating func emitCCCMember( + _ member: DSLTree.CustomCharacterClass.Member + ) throws { + switch member { + case .atom(let atom): + switch atom { + case .char(let c): + emitCharacterInCCC(c) + case .scalar(let s): + emitCharacterInCCC(Character(s)) + default: + try emitAtom(atom) + } + case .custom(let ccc): + try emitCustomCharacterClass(ccc) + case .quotedLiteral: + fatalError("Removed in 'flatteningCustomCharacterClassMembers'") + case .range: + let consumer = try member.generateConsumer(options) + builder.buildConsume(by: consumer) + case .trivia: + return + + // TODO: Can we decide when it's better to try `rhs` first? + // Intersection is trivial, since failure on either side propagates: + // - store current position + // - lhs + // - restore current position + // - rhs + case let .intersection(lhs, rhs): + let r = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) + + // TODO: Can we decide when it's better to try `rhs` first? + // For subtraction, failure in `lhs` propagates, while failure in `rhs` is + // swallowed/reversed: + // - store current position + // - lhs + // - save to end + // - restore current position + // - rhs + // - clear, fail (since both succeeded) + // - end: ... + case let .subtraction(lhs, rhs): + let r = builder.makePositionRegister() + let end = builder.makeAddress() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) // no match here = failure, propagates + builder.buildSave(end) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) // no match here = success, resumes at 'end' + builder.buildClear() // clears 'end' + builder.buildFail() // this failure propagates outward + builder.label(end) + + // Symmetric difference always requires executing both `rhs` and `lhs`. + // Execute each, ignoring failure and storing the resulting position in a + // register. If those results are equal, fail. If they're different, use + // the position that is different from the starting position: + // - store current position as r0 + // - save to lhsFail + // - lhs + // - clear lhsFail (and continue) + // - lhsFail: save position as r1 + // + // - restore current position + // - save to rhsFail + // - rhs + // - clear rhsFail (and continue) + // - rhsFail: save position as r2 + // + // - restore to resulting position from lhs (r1) + // - if equal to r2, goto fail (both sides had same result) + // - if equal to r0, goto advance (lhs failed) + // - goto end + // - advance: restore to resulting position from rhs (r2) + // - goto end + // - fail: fail + // - end: ... + case let .symmetricDifference(lhs, rhs): + let r0 = builder.makePositionRegister() + let r1 = builder.makePositionRegister() + let r2 = builder.makePositionRegister() + let lhsFail = builder.makeAddress() + let rhsFail = builder.makeAddress() + let advance = builder.makeAddress() + let fail = builder.makeAddress() + let end = builder.makeAddress() + + builder.buildMoveCurrentPosition(into: r0) + builder.buildSave(lhsFail) + try emitCustomCharacterClass(lhs) + builder.buildClear() + builder.label(lhsFail) + builder.buildMoveCurrentPosition(into: r1) + + builder.buildRestorePosition(from: r0) + builder.buildSave(rhsFail) + try emitCustomCharacterClass(rhs) + builder.buildClear() + builder.label(rhsFail) + builder.buildMoveCurrentPosition(into: r2) + + // If r1 == r2, then fail + builder.buildRestorePosition(from: r1) + builder.buildCondBranch(to: fail, ifSamePositionAs: r2) + + // If r1 == r0, then move to r2 before ending + builder.buildCondBranch(to: advance, ifSamePositionAs: r0) + builder.buildBranch(to: end) + builder.label(advance) + builder.buildRestorePosition(from: r2) + builder.buildBranch(to: end) + + builder.label(fail) + builder.buildFail() + builder.label(end) + } + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { @@ -876,8 +1057,67 @@ fileprivate extension Compiler.ByteCodeGen { } return } - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + + let updatedCCC: DSLTree.CustomCharacterClass + if optimizationsEnabled { + updatedCCC = ccc.coalescingASCIIMembers(options) + } else { + updatedCCC = ccc + } + let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia}) + + if updatedCCC.isInverted { + // inverted + // custom character class: p0 | p1 | ... | pn + // Try each member to make sure they all fail + // save next_p1 + // + // clear, fail + // next_p1: + // save next_p2 + // + // clear fail + // next_p2: + // save next_p... + // + // clear fail + // ... + // next_pn: + // save done + // + // clear fail + // done: + // step forward by 1 + let done = builder.makeAddress() + for member in filteredMembers.dropLast() { + let next = builder.makeAddress() + builder.buildSave(next) + try emitCCCMember(member) + builder.buildClear() + builder.buildFail() + builder.label(next) + } + builder.buildSave(done) + try emitCCCMember(filteredMembers.last!) + builder.buildClear() + builder.buildFail() + builder.label(done) + + // Consume a single unit for the inverted ccc + switch options.semanticLevel { + case .graphemeCluster: + builder.buildAdvance(1) + case .unicodeScalar: + builder.buildAdvanceUnicodeScalar(1) + } + return + } + // non inverted CCC + // Custom character class: p0 | p1 | ... | pn + // Very similar to alternation, but we don't keep backtracking save points + try emitAlternationGen(filteredMembers, withBacktracking: false) { + try $0.emitCCCMember($1) + } } mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { @@ -1014,6 +1254,12 @@ fileprivate extension Compiler.ByteCodeGen { } extension DSLTree.Node { + /// A Boolean value indicating whether this node advances the match position + /// on a successful match. + /// + /// For example, an alternation like `(a|b|c)` always advances the position + /// by a character, but `(a|b|)` has an empty branch, which matches without + /// advancing. var guaranteesForwardProgress: Bool { switch self { case .orderedChoice(let children): @@ -1044,8 +1290,8 @@ extension DSLTree.Node { case .consumer, .matcher: // Allow zero width consumers and matchers return false - case .customCharacterClass: - return true + case .customCharacterClass(let ccc): + return ccc.guaranteesForwardProgress case .quantification(let amount, _, let child): let (atLeast, _) = amount.ast.bounds return atLeast ?? 0 > 0 && child.guaranteesForwardProgress @@ -1053,3 +1299,25 @@ extension DSLTree.Node { } } } + +extension DSLTree.CustomCharacterClass { + /// We allow trivia into CustomCharacterClass, which could result in a CCC + /// that matches nothing, ie `(?x)[ ]`. + var guaranteesForwardProgress: Bool { + for m in members { + switch m { + case .trivia: + continue + case let .intersection(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + case let .subtraction(lhs, _): + return lhs.guaranteesForwardProgress + case let .symmetricDifference(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + default: + return true + } + } + return false + } +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 99d1b3cae..898985a4d 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -18,89 +18,12 @@ extension Character { } } -extension DSLTree.Node { - /// Attempt to generate a consumer from this AST node - /// - /// A consumer is a Swift closure that matches against - /// the front of an input range - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case .atom(let a): - return try a.generateConsumer(opts) - case .customCharacterClass(let ccc): - return try ccc.generateConsumer(opts) - - case .quotedLiteral: - // TODO: Should we handle this here? - return nil - - case let .convertedRegexLiteral(n, _): - return try n.generateConsumer(opts) - - case .orderedChoice, .conditional, .concatenation, - .capture, .nonCapturingGroup, - .quantification, .trivia, .empty, - .ignoreCapturesInTypedOutput, .absentFunction: return nil - - case .consumer: - fatalError("FIXME: Is this where we handle them?") - case .matcher: - fatalError("FIXME: Is this where we handle them?") - case .characterPredicate: - fatalError("FIXME: Is this where we handle them?") - } - } -} - extension DSLTree._AST.Atom { var singleScalarASCIIValue: UInt8? { return ast.singleScalarASCIIValue } } -extension Character { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - let isCaseInsensitive = opts.isCaseInsensitive - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - guard let (char, next) = input.characterAndEnd( - at: bounds.lowerBound, limitedBy: bounds.upperBound) - else { return nil } - if isCaseInsensitive && isCased { - return char.lowercased() == self.lowercased() - ? next - : nil - } else { - return char == self - ? next - : nil - } - } - case .unicodeScalar: - // TODO: This should only be reachable from character class emission, can - // we guarantee that? Otherwise we'd want a different matching behavior. - let consumers = unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } - } -} - extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { @@ -114,85 +37,6 @@ extension DSLTree.Atom { return nil } } - - // TODO: If ByteCodeGen switches first, then this is unnecessary for - // top-level nodes, but it's also invoked for `.atom` members of a custom CC - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case let .char(c): - return try c.generateConsumer(opts) - - case let .scalar(s): - // A scalar always matches the same as a single scalar character. This - // means it must match a whole grapheme in grapheme semantic mode, but - // can match a single scalar in scalar semantic mode. - return try Character(s).generateConsumer(opts) - - case .any: - // FIXME: Should this be a total ordering? - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - input.index(after: bounds.lowerBound) - } - } else { - return consumeScalar { _ in - true - } - } - - case .anyNonNewline: - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) - } - case .unicodeScalar: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.unicodeScalars.index(after: bounds.lowerBound) - } - } - - case .dot: - throw Unreachable(".atom(.dot) should be handled by emitDot") - - case .assertion: - // TODO: We could handle, should this be total? - return nil - case .characterClass(let cc): - return cc.generateConsumer(opts) - - case .backreference: - // TODO: Should we handle? - return nil - - case .symbolicReference: - // TODO: Should we handle? - return nil - - case .changeMatchingOptions: - // TODO: Should we handle? - return nil - - case let .unconverted(a): - return try a.ast.generateConsumer(opts) - } - - } -} - -extension DSLTree.Atom.CharacterClass { - func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { - let model = asRuntimeModel(opts) - return { input, bounds in - model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound) - } - } } extension String { @@ -283,47 +127,20 @@ extension AST.Atom { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { switch kind { - case let .scalar(s): - assertionFailure( - "Should have been handled by tree conversion") - return consumeScalar { $0 == s.value } - - case let .char(c): - assertionFailure( - "Should have been handled by tree conversion") - - // TODO: Match level? - return { input, bounds in - let low = bounds.lowerBound - guard input[low] == c else { - return nil - } - return input.index(after: low) - } - case let .property(p): return try p.generateConsumer(opts) case let .namedCharacter(name): return consumeName(name, opts: opts) - case .dot: - assertionFailure( - "Should have been handled by tree conversion") - fatalError(".atom(.dot) is handled in emitDot") - - case .caretAnchor, .dollarAnchor: - // handled in emitAssertion - return nil - case .escaped: - // handled in emitAssertion and emitCharacterClass - return nil - case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement return nil + + case .char, .scalar, .escaped, .dot, .caretAnchor, .dollarAnchor: + fatalError("Handled in ByteCodeGen or earlier.") #if RESILIENT_LIBRARIES @unknown default: @@ -367,11 +184,6 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { switch self { - case let .atom(a): - guard let c = try a.generateConsumer(opts) else { - throw Unsupported("Consumer for \(a)") - } - return c case let .range(low, high): guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") @@ -437,57 +249,12 @@ extension DSLTree.CustomCharacterClass.Member { return nil } - case let .custom(ccc): - return try ccc.generateConsumer(opts) - - case let .intersection(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - let rhsIdx = rhs(input, bounds) - { - guard lhsIdx == rhsIdx else { - fatalError("TODO: What should we do here?") - } - return lhsIdx - } - return nil - } + case .atom, .custom, .intersection, .subtraction, .symmetricDifference: + fatalError("Handled in 'emitCustomCharacterClass'") - case let .subtraction(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - rhs(input, bounds) == nil - { - return lhsIdx - } - return nil - } + case .quotedLiteral: + fatalError("Removed in 'flatteningCustomCharacterClassMembers'") - case let .symmetricDifference(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds) { - return rhs(input, bounds) == nil ? lhsIdx : nil - } - return rhs(input, bounds) - } - case .quotedLiteral(let str): - let consumers = try str.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } case .trivia: // TODO: Should probably strip this earlier... return { _, _ in nil } @@ -508,28 +275,6 @@ extension DSLTree.CustomCharacterClass { } ) } - - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - // NOTE: Easy way to implement, obviously not performant - let consumers = try members.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return isInverted ? nil : idx - } - } - if isInverted { - return opts.semanticLevel == .graphemeCluster - ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound) - : input.unicodeScalars.index(after: bounds.lowerBound) - } - return nil - } - } } // NOTE: Conveniences, though not most performant @@ -638,7 +383,6 @@ extension AST.Atom.CharacterProperty { case .generalCategory(let p): return try p.generateConsumer(opts) -// fatalError("TODO: Map categories: \(p)") case .binary(let prop, value: let value): let cons = try prop.generateConsumer(opts) diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 21ab90a03..011174660 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -44,6 +44,14 @@ extension Instruction { /// Operands: /// - Position register to move into case moveCurrentPosition + + /// Set the current position to the value stored in the register + /// + /// restorePosition(from: PositionRegister) + /// + /// Operands: + /// - Position register to read from + case restorePosition // MARK: General Purpose: Control flow diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 44c938e71..1f5611f98 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -84,6 +84,10 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } + mutating func buildFatalError() { + instructions.append(.init(.invalid)) + } + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { @@ -315,6 +319,10 @@ extension MEProgram.Builder { instructions.append(.init(.moveCurrentPosition, .init(position: r))) } + mutating func buildRestorePosition(from r: PositionRegister) { + instructions.append(.init(.restorePosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister, isScalarMode: Bool diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 86365322b..0de7c2f70 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -175,14 +175,25 @@ extension Processor { // save point was restored mutating func consume(_ n: Distance) -> Bool { // TODO: needs benchmark coverage - guard let idx = input.index( + if let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end - ) else { - signalFailure() - return false + ) { + currentPosition = idx + return true } - currentPosition = idx - return true + + // If `end` falls in the middle of a character, and we are trying to advance + // by one "character", then we should max out at `end` even though the above + // advancement will result in `nil`. + if n == 1, let idx = input.unicodeScalars.index( + currentPosition, offsetBy: n.rawValue, limitedBy: end + ) { + currentPosition = idx + return true + } + + signalFailure() + return false } // Advances in unicode scalar view @@ -413,6 +424,10 @@ extension Processor { let reg = payload.position registers[reg] = currentPosition controller.step() + case .restorePosition: + let reg = payload.position + currentPosition = registers[reg] + controller.step() case .branch: controller.pc = payload.addr diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 9f0e2df23..2cc4d032f 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -131,6 +131,23 @@ extension DSLTree { } } + func coalescingASCIIMembers(_ opts: MatchingOptions) -> CustomCharacterClass { + var ascii: [Member] = [] + var nonAscii: [Member] = [] + for member in members { + if member.asAsciiBitset(opts, false) != nil { + ascii.append(member) + } else { + nonAscii.append(member) + } + } + if ascii.isEmpty || nonAscii.isEmpty { return self } + return CustomCharacterClass(members: [ + .custom(CustomCharacterClass(members: ascii)), + .custom(CustomCharacterClass(members: nonAscii)) + ], isInverted: isInverted) + } + public init(members: [DSLTree.CustomCharacterClass.Member], isInverted: Bool = false) { self.members = members self.isInverted = isInverted @@ -161,6 +178,17 @@ extension DSLTree { indirect case intersection(CustomCharacterClass, CustomCharacterClass) indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) + + var isOnlyTrivia: Bool { + switch self { + case .custom(let ccc): + return ccc.members.all(\.isOnlyTrivia) + case .trivia: + return true + default: + return false + } + } } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index aafe752bc..d0500847b 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -19,6 +19,7 @@ enum DecodedInstr { case invalid case moveImmediate case moveCurrentPosition + case restorePosition case branch case condBranchZeroElseDecrement case condBranchSamePosition @@ -65,6 +66,8 @@ extension DecodedInstr { return .moveImmediate case .moveCurrentPosition: return .moveCurrentPosition + case .restorePosition: + return .restorePosition case .branch: return .branch case .condBranchZeroElseDecrement: @@ -368,6 +371,48 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: "[a-c]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[a-c0123]", + contains: [.matchBitset], + doesNotContain: [.matchBitsetScalar, .consumeBy]) + expectProgram( + for: #"\w"#, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"[\w]"#, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"\w"#, + semanticLevel: .unicodeScalar, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"[\w]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"\p{Greek}"#, + contains: [.consumeBy], + doesNotContain: [.matchBuiltin, .matchBitset, .matchBitsetScalar]) + + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + + expectProgram( + for: "[a-á]", + contains: [.consumeBy], + doesNotContain: [.matchBitset, .matchBitsetScalar]) + expectProgram( + for: "[a-fá-ém-zk]", + contains: [.matchBitset, .consumeBy], + doesNotContain: [.matchBitsetScalar]) } func testScalarOptimizeCompilation() {