diff --git a/Sources/_MatchingEngine/Regex/AST/AST.swift b/Sources/_MatchingEngine/Regex/AST/AST.swift index 693acaec5..8a9a2560f 100644 --- a/Sources/_MatchingEngine/Regex/AST/AST.swift +++ b/Sources/_MatchingEngine/Regex/AST/AST.swift @@ -38,6 +38,8 @@ public indirect enum AST: case customCharacterClass(CustomCharacterClass) + case absentFunction(AbsentFunction) + case empty(Empty) // FIXME: Move off the regex literal AST @@ -55,16 +57,17 @@ extension AST { // over `self` _everywhere_ we want to do anything. var _associatedValue: _ASTNode { switch self { - case let .alternation(v): return v - case let .concatenation(v): return v - case let .group(v): return v - case let .conditional(v): return v - case let .quantification(v): return v - case let .quote(v): return v - case let .trivia(v): return v - case let .atom(v): return v - case let .customCharacterClass(v): return v - case let .empty(v): return v + case let .alternation(v): return v + case let .concatenation(v): return v + case let .group(v): return v + case let .conditional(v): return v + case let .quantification(v): return v + case let .quote(v): return v + case let .trivia(v): return v + case let .atom(v): return v + case let .customCharacterClass(v): return v + case let .empty(v): return v + case let .absentFunction(v): return v case let .groupTransform(g, _): return g // FIXME: get this out of here @@ -110,7 +113,7 @@ extension AST { switch self { case .atom(let a): return a.isQuantifiable - case .group, .conditional, .customCharacterClass: + case .group, .conditional, .customCharacterClass, .absentFunction: return true case .alternation, .concatenation, .quantification, .quote, .trivia, .empty, .groupTransform: @@ -185,6 +188,50 @@ extension AST { } } + /// An Oniguruma absent function. This is used to model a pattern which should + /// not be matched against across varying scopes. + public struct AbsentFunction: Hashable, _ASTNode { + public enum Start: Hashable { + /// `(?~|` + case withPipe + + /// `(?~` + case withoutPipe + } + public enum Kind: Hashable { + /// An absent repeater `(?~absent)`. This is equivalent to `(?~|absent|.*)` + /// and therefore matches as long as the pattern `absent` is not matched. + case repeater(AST) + + /// An absent expression `(?~|absent|expr)`, which defines an `absent` + /// pattern which must not be matched against while the pattern `expr` is + /// matched. + case expression(absentee: AST, pipe: SourceLocation, expr: AST) + + /// An absent stopper `(?~|absent)`, which prevents matching against + /// `absent` until the end of the regex, or until it is cleared. + case stopper(AST) + + /// An absent clearer `(?~|)` which cancels the effect of an absent + /// stopper. + case clearer + } + /// The location of `(?~` or `(?~|` + public var start: SourceLocation + + public var kind: Kind + + public var location: SourceLocation + + public init( + _ kind: Kind, start: SourceLocation, location: SourceLocation + ) { + self.kind = kind + self.start = start + self.location = location + } + } + public struct Reference: Hashable { @frozen public enum Kind: Hashable { diff --git a/Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift b/Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift index 52017f51c..6c3e3231c 100644 --- a/Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift +++ b/Sources/_MatchingEngine/Regex/AST/ASTProtocols.swift @@ -40,3 +40,12 @@ extension AST.Group: _ASTParent { extension AST.Quantification: _ASTParent { var children: [AST] { [child] } } +extension AST.AbsentFunction: _ASTParent { + var children: [AST] { + switch kind { + case .repeater(let a), .stopper(let a): return [a] + case .expression(let a, _, let c): return [a, c] + case .clearer: return [] + } + } +} diff --git a/Sources/_MatchingEngine/Regex/AST/Atom.swift b/Sources/_MatchingEngine/Regex/AST/Atom.swift index 10b616bd2..b5ddeec2a 100644 --- a/Sources/_MatchingEngine/Regex/AST/Atom.swift +++ b/Sources/_MatchingEngine/Regex/AST/Atom.swift @@ -476,14 +476,117 @@ extension AST.Atom { } extension AST.Atom { - public struct Callout: Hashable { - public enum Argument: Hashable { - case number(Int) - case string(String) + public enum Callout: Hashable { + /// A PCRE callout written `(?C...)` + public struct PCRE: Hashable { + public enum Argument: Hashable { + case number(Int) + case string(String) + } + public var arg: AST.Located + + public init(_ arg: AST.Located) { + self.arg = arg + } + + /// Whether the argument isn't written explicitly in the source, e.g + /// `(?C)` which is implicitly `(?C0)`. + public var isImplicit: Bool { arg.location.isEmpty } } - public var arg: AST.Located - public init(_ arg: AST.Located) { - self.arg = arg + + /// A named Oniguruma callout written `(*name[tag]{args, ...})` + public struct OnigurumaNamed: Hashable { + public struct ArgList: Hashable { + public var leftBrace: SourceLocation + public var args: [AST.Located] + public var rightBrace: SourceLocation + + public init( + _ leftBrace: SourceLocation, + _ args: [AST.Located], + _ rightBrace: SourceLocation + ) { + self.leftBrace = leftBrace + self.args = args + self.rightBrace = rightBrace + } + } + + public var name: AST.Located + public var tag: OnigurumaTag? + public var args: ArgList? + + public init( + _ name: AST.Located, tag: OnigurumaTag?, args: ArgList? + ) { + self.name = name + self.tag = tag + self.args = args + } + } + + /// An Oniguruma callout 'of contents', written `(?{...}[tag]D)` + public struct OnigurumaOfContents: Hashable { + public enum Direction: Hashable { + case inProgress // > (the default) + case inRetraction // < + case both // X + } + public var openBraces: SourceLocation + public var contents: AST.Located + public var closeBraces: SourceLocation + public var tag: OnigurumaTag? + public var direction: AST.Located + + public init( + _ openBraces: SourceLocation, _ contents: AST.Located, + _ closeBraces: SourceLocation, tag: OnigurumaTag?, + direction: AST.Located + ) { + self.openBraces = openBraces + self.contents = contents + self.closeBraces = closeBraces + self.tag = tag + self.direction = direction + } + + /// Whether the direction flag isn't written explicitly in the + /// source, e.g `(?{x})` which is implicitly `(?{x}>)`. + public var isDirectionImplicit: Bool { direction.location.isEmpty } + } + case pcre(PCRE) + case onigurumaNamed(OnigurumaNamed) + case onigurumaOfContents(OnigurumaOfContents) + + private var _associatedValue: Any { + switch self { + case .pcre(let v): return v + case .onigurumaNamed(let v): return v + case .onigurumaOfContents(let v): return v + } + } + + func `as`(_ t: T.Type = T.self) -> T? { + _associatedValue as? T + } + } +} + +extension AST.Atom.Callout { + /// A tag specifier `[...]` which may appear in an Oniguruma callout. + public struct OnigurumaTag: Hashable { + public var leftBracket: SourceLocation + public var name: AST.Located + public var rightBracket: SourceLocation + + public init( + _ leftBracket: SourceLocation, + _ name: AST.Located, + _ rightBracket: SourceLocation + ) { + self.leftBracket = leftBracket + self.name = name + self.rightBracket = rightBracket } } } @@ -594,7 +697,7 @@ extension AST { case .alternation, .concatenation, .group, .conditional, .quantification, .quote, .trivia, .customCharacterClass, .empty, - .groupTransform: + .groupTransform, .absentFunction: return nil } } diff --git a/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift b/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift index b87bb4454..e8425ab16 100644 --- a/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift +++ b/Sources/_MatchingEngine/Regex/Parse/CaptureStructure.swift @@ -81,6 +81,15 @@ extension AST { quantification.amount.value == .zeroOrOne ? CaptureStructure.optional : CaptureStructure.array) + case .absentFunction(let abs): + // Only the child of an expression absent function is relevant, as the + // other expressions don't actually get matched against. + switch abs.kind { + case .expression(_, _, let child): + return child.captureStructure + case .clearer, .repeater, .stopper: + return .empty + } case .quote, .trivia, .atom, .customCharacterClass, .empty: return .empty } diff --git a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift index d5e4f122b..6740d6b90 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift @@ -31,6 +31,8 @@ enum ParseError: Error, Hashable { case tooManyBranchesInConditional(Int) case unsupportedCondition(String) + case tooManyAbsentExpressionChildren(Int) + case expectedASCII(Character) case expectedNonEmptyContents @@ -55,10 +57,25 @@ enum ParseError: Error, Hashable { case emptyProperty case expectedGroupSpecifier - case expectedGroupName - case groupNameMustBeAlphaNumeric - case groupNameCannotStartWithNumber + case unbalancedEndOfGroup + + // Identifier diagnostics. + case expectedIdentifier(IdentifierKind) + case identifierMustBeAlphaNumeric(IdentifierKind) + case identifierCannotStartWithNumber(IdentifierKind) + case cannotRemoveTextSegmentOptions + case expectedCalloutArgument +} + +extension IdentifierKind { + fileprivate var diagDescription: String { + switch self { + case .groupName: return "group name" + case .onigurumaCalloutName: return "callout name" + case .onigurumaCalloutTag: return "callout tag" + } + } } extension ParseError: CustomStringConvertible { @@ -96,6 +113,8 @@ extension ParseError: CustomStringConvertible { return "expected 2 branches in conditional, have \(i)" case let .unsupportedCondition(str): return "\(str) cannot be used as condition" + case let .tooManyAbsentExpressionChildren(i): + return "expected 2 expressions in absent expression, have \(i)" case let .unknownGroupKind(str): return "unknown group kind '(\(str)'" case let .unknownCalloutKind(str): @@ -116,14 +135,18 @@ extension ParseError: CustomStringConvertible { return "empty property" case .expectedGroupSpecifier: return "expected group specifier" - case .expectedGroupName: - return "expected group name" - case .groupNameMustBeAlphaNumeric: - return "group name must only contain alphanumeric characters" - case .groupNameCannotStartWithNumber: - return "group name must not start with number" + case .unbalancedEndOfGroup: + return "closing ')' does not balance any groups openings" + case .expectedIdentifier(let i): + return "expected \(i.diagDescription)" + case .identifierMustBeAlphaNumeric(let i): + return "\(i.diagDescription) must only contain alphanumeric characters" + case .identifierCannotStartWithNumber(let i): + return "\(i.diagDescription) must not start with number" case .cannotRemoveTextSegmentOptions: return "text segment mode cannot be unset, only changed" + case .expectedCalloutArgument: + return "expected argument to callout" } } } diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index fe9deea35..39c7ad346 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -62,12 +62,14 @@ extension Source { /// Record source loc before processing and return /// or throw the value/error with source locations. + @discardableResult fileprivate mutating func recordLoc( _ f: (inout Self) throws -> () - ) rethrows { + ) rethrows -> SourceLocation { let start = currentPosition do { try f(&self) + return SourceLocation(start.. { throw e } catch let e as ParseError { @@ -83,8 +85,9 @@ extension Source { typealias Quant = AST.Quantification /// Throws an expected character error if not matched - mutating func expect(_ c: Character) throws { - _ = try recordLoc { src in + @discardableResult + mutating func expect(_ c: Character) throws -> SourceLocation { + try recordLoc { src in guard src.tryEat(c) else { throw ParseError.expected(String(c)) } @@ -182,6 +185,12 @@ enum RadixKind { } } +enum IdentifierKind { + case groupName + case onigurumaCalloutName + case onigurumaCalloutTag +} + extension Source { /// Validate a string of digits as a particular radix, and return the number, /// or throw an error if the string is malformed or would overflow the number @@ -428,14 +437,16 @@ extension Source { /// delimiter. If `ignoreEscaped` is true, escaped characters will not be /// considered for the ending delimiter. private mutating func expectQuoted( - endingWith end: String, ignoreEscaped: Bool = false, eatEnding: Bool = true + endingWith endSingle: String, count: Int = 1, ignoreEscaped: Bool = false, + eatEnding: Bool = true ) throws -> Located { + let end = String(repeating: endSingle, count: count) let result = try recordLoc { src -> String in try src.lexUntil { src in if src.starts(with: end) { return true } - try src.expectNonEmpty(.expected(end)) + try src.expectNonEmpty(.expected(endSingle)) // Ignore escapes if we're allowed to. lexUntil will consume the next // character. @@ -659,19 +670,22 @@ extension Source { } } - /// Consume a group name. - private mutating func expectGroupName( - endingWith ending: String, eatEnding: Bool = true + /// Consume an identifier. + /// + /// Identifier -> [\w--\d] \w* + /// + private mutating func expectIdentifier( + _ kind: IdentifierKind, endingWith ending: String, eatEnding: Bool = true ) throws -> Located { let str = try recordLoc { src -> String in if src.isEmpty || src.tryEat(sequence: ending) { - throw ParseError.expectedGroupName + throw ParseError.expectedIdentifier(kind) } if src.peek()!.isNumber { - throw ParseError.groupNameCannotStartWithNumber + throw ParseError.identifierCannotStartWithNumber(kind) } guard let str = src.tryEatPrefix(\.isWordCharacter)?.string else { - throw ParseError.groupNameMustBeAlphaNumeric + throw ParseError.identifierMustBeAlphaNumeric(kind) } return str } @@ -681,13 +695,22 @@ extension Source { return str } + /// Try to consume an identifier, returning `nil` if unsuccessful. + private mutating func lexIdentifier( + _ kind: IdentifierKind, endingWith end: String, eatEnding: Bool = true + ) -> Located? { + tryEating { src in + try? src.expectIdentifier(kind, endingWith: end, eatEnding: eatEnding) + } + } + /// Consume a named group field, producing either a named capture or balanced /// capture. /// /// NamedGroup -> 'P<' GroupNameBody '>' /// | '<' GroupNameBody '>' /// | "'" GroupNameBody "'" - /// GroupNameBody -> \w+ | \w* '-' \w+ + /// GroupNameBody -> Identifier | Identifier? '-' Identifier /// private mutating func expectNamedGroup( endingWith ending: String @@ -695,14 +718,16 @@ extension Source { func lexBalanced(_ lhs: Located? = nil) throws -> AST.Group.Kind? { // If we have a '-', this is a .NET-style 'balanced group'. guard let dash = tryEatWithLoc("-") else { return nil } - let rhs = try expectGroupName(endingWith: ending) + let rhs = try expectIdentifier(.groupName, endingWith: ending) return .balancedCapture(.init(name: lhs, dash: dash, priorName: rhs)) } // Lex a group name, trying to lex a '-rhs' for a balanced capture group // both before and after. if let b = try lexBalanced() { return b } - let name = try expectGroupName(endingWith: ending, eatEnding: false) + let name = try expectIdentifier( + .groupName, endingWith: ending, eatEnding: false + ) if let b = try lexBalanced(name) { return b } try expect(sequence: ending) @@ -942,6 +967,19 @@ extension Source { } } + /// Try to consume the start of an absent function. + /// + /// AbsentFunctionStart -> '(?~' '|'? + /// + mutating func lexAbsentFunctionStart( + ) -> Located? { + recordLoc { src in + if src.tryEat(sequence: "(?~|") { return .withPipe } + if src.tryEat(sequence: "(?~") { return .withoutPipe } + return nil + } + } + mutating func lexCustomCCStart( ) throws -> Located? { recordLoc { src in @@ -1105,7 +1143,8 @@ extension Source { ) throws -> AST.Reference { // Note we don't want to eat the ending as we may also want to parse a // recursion level. - let str = try expectGroupName(endingWith: end, eatEnding: false) + let str = try expectIdentifier( + .groupName, endingWith: end, eatEnding: false) // If we're allowed to, try parse a recursion level. let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil @@ -1321,12 +1360,15 @@ extension Source { // The start of a reference '(?P=', '(?R', ... if src.canLexGroupLikeReference() { return true } - // The start of a callout. + // The start of a PCRE callout. if src.tryEat("C") { return true } + // The start of an Oniguruma 'of-contents' callout. + if src.tryEat("{") { return true } + return false } - // The start of a backreference directive. + // The start of a backreference directive or Oniguruma named callout. if src.tryEat("*") { return true } return false @@ -1393,22 +1435,22 @@ extension Source { } } - /// Try to consume a callout. + /// Try to consume a PCRE callout. /// - /// Callout -> '(?C' CalloutBody ')' - /// CalloutBody -> '' | - /// | '`' '`' - /// | "'" "'" - /// | '"' '"' - /// | '^' '^' - /// | '%' '%' - /// | '#' '#' - /// | '$' '$' - /// | '{' '}' + /// PCRECallout -> '(?C' CalloutBody ')' + /// PCRECalloutBody -> '' | + /// | '`' '`' + /// | "'" "'" + /// | '"' '"' + /// | '^' '^' + /// | '%' '%' + /// | '#' '#' + /// | '$' '$' + /// | '{' '}' /// - mutating func lexCallout() throws -> AST.Atom.Callout? { + mutating func lexPCRECallout() throws -> AST.Atom.Callout? { guard tryEat(sequence: "(?C") else { return nil } - let arg = try recordLoc { src -> AST.Atom.Callout.Argument in + let arg = try recordLoc { src -> AST.Atom.Callout.PCRE.Argument in // Parse '(?C' followed by a number. if let num = try src.lexNumber() { return .number(num.value) @@ -1432,7 +1474,104 @@ extension Source { throw ParseError.unknownCalloutKind("(?C\(remaining))") } try expect(")") - return .init(arg) + return .pcre(.init(arg)) + } + + /// Consume a list of arguments for an Oniguruma callout. + /// + /// OnigurumaCalloutArgList -> OnigurumaCalloutArg (',' OnigurumaCalloutArgList)* + /// OnigurumaCalloutArg -> [^,}]+ + /// + mutating func expectOnigurumaCalloutArgList( + leftBrace: SourceLocation + ) throws -> AST.Atom.Callout.OnigurumaNamed.ArgList { + var args: [Located] = [] + while true { + let arg = try recordLoc { src -> String in + // TODO: Warn about whitespace being included? + guard let arg = src.tryEatPrefix({ $0 != "," && $0 != "}" }) else { + throw ParseError.expectedCalloutArgument + } + return arg.string + } + args.append(arg) + + if peek() == "}" { break } + try expect(",") + } + let rightBrace = try expect("}") + return .init(leftBrace, args, rightBrace) + } + + /// Try to consume an Oniguruma callout tag. + /// + /// OnigurumaTag -> '[' Identifier ']' + /// + mutating func lexOnigurumaCalloutTag( + ) throws -> AST.Atom.Callout.OnigurumaTag? { + guard let leftBracket = tryEatWithLoc("[") else { return nil } + let name = try expectIdentifier( + .onigurumaCalloutTag, endingWith: "]", eatEnding: false + ) + let rightBracket = try expect("]") + return .init(leftBracket, name, rightBracket) + } + + /// Try to consume a named Oniguruma callout. + /// + /// OnigurumaNamedCallout -> '(*' Identifier OnigurumaTag? Args? ')' + /// Args -> '{' OnigurumaCalloutArgList '}' + /// + mutating func lexOnigurumaNamedCallout() throws -> AST.Atom.Callout? { + try tryEating { src in + guard src.tryEat(sequence: "(*") else { return nil } + guard let name = src.lexIdentifier( + .onigurumaCalloutName, endingWith: ")", eatEnding: false) + else { return nil } + + let tag = try src.lexOnigurumaCalloutTag() + + let args = try src.tryEatWithLoc("{").map { + try src.expectOnigurumaCalloutArgList(leftBrace: $0) + } + try src.expect(")") + return .onigurumaNamed(.init(name, tag: tag, args: args)) + } + } + + /// Try to consume an Oniguruma callout 'of contents'. + /// + /// OnigurumaCalloutOfContents -> '(?' '{'+ Contents '}'+ OnigurumaTag? Direction? ')' + /// Contents -> + /// Direction -> 'X' | '<' | '>' + /// + mutating func lexOnigurumaCalloutOfContents() throws -> AST.Atom.Callout? { + try tryEating { src in + guard src.tryEat(sequence: "(?"), + let openBraces = src.tryEatPrefix({ $0 == "{" }) + else { return nil } + + let contents = try src.expectQuoted( + endingWith: "}", count: openBraces.count) + let closeBraces = SourceLocation( + contents.location.end ..< src.currentPosition) + + let tag = try src.lexOnigurumaCalloutTag() + + typealias Direction = AST.Atom.Callout.OnigurumaOfContents.Direction + let direction = src.recordLoc { src -> Direction in + if src.tryEat(">") { return .inProgress } + if src.tryEat("<") { return .inRetraction } + if src.tryEat("X") { return .both } + // The default is in-progress. + return .inProgress + } + try src.expect(")") + + let openBracesLoc = SourceLocation(from: openBraces) + return .onigurumaOfContents(.init( + openBracesLoc, contents, closeBraces, tag: tag, direction: direction)) + } } /// Try to consume a backtracking directive. @@ -1485,14 +1624,25 @@ extension Source { return ref.value } + // (*ACCEPT), (*FAIL), (*MARK), ... + if let b = try src.lexBacktrackingDirective() { + return .backtrackingDirective(b) + } + // (?C) - if let callout = try src.lexCallout() { + if let callout = try src.lexPCRECallout() { return .callout(callout) } - // (*ACCEPT), (*FAIL), (*MARK), ... - if let b = try src.lexBacktrackingDirective() { - return .backtrackingDirective(b) + // Try to consume an Oniguruma named callout '(*name)', which should be + // done after backtracking directives and global options. + if let callout = try src.lexOnigurumaNamedCallout() { + return .callout(callout) + } + + // (?{...}) + if let callout = try src.lexOnigurumaCalloutOfContents() { + return .callout(callout) } // If we didn't produce an atom, consume up until a reasonable end-point diff --git a/Sources/_MatchingEngine/Regex/Parse/Parse.swift b/Sources/_MatchingEngine/Regex/Parse/Parse.swift index a633ee95b..01629e99d 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Parse.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Parse.swift @@ -114,12 +114,29 @@ extension Parser { } extension Parser { - /// Parse a regular expression + /// Parse a top-level regular expression. Do not use for recursive calls, use + /// `parseNode()` instead. /// - /// Regex -> '' | Alternation - /// Alternation -> Concatenation ('|' Concatenation)* + /// Regex -> RegexNode /// mutating func parse() throws -> AST { + let ast = try parseNode() + guard source.isEmpty else { + if let loc = source.tryEatWithLoc(")") { + throw Source.LocatedError(ParseError.unbalancedEndOfGroup, loc) + } + fatalError("Unhandled termination condition") + } + return ast + } + + /// Parse a regular expression node. This should be used instead of `parse()` + /// for recursive calls. + /// + /// RegexNode -> '' | Alternation + /// Alternation -> Concatenation ('|' Concatenation)* + /// + mutating func parseNode() throws -> AST { let _start = source.currentPosition if source.isEmpty { return .empty(.init(loc(_start))) } @@ -203,7 +220,7 @@ extension Parser { mutating func parseConditionalBranches( start: Source.Position, _ cond: AST.Conditional.Condition ) throws -> AST { - let child = try parse() + let child = try parseNode() let trueBranch: AST, falseBranch: AST, pipe: SourceLocation? switch child { case .alternation(let a): @@ -237,7 +254,7 @@ extension Parser { ) throws -> AST.Group { context.recordGroup(kind.value) - let child = try parse() + let child = try parseNode() // An implicit scoped group has already consumed its closing paren. if !kind.value.hasImplicitScope { try source.expect(")") @@ -245,11 +262,58 @@ extension Parser { return .init(kind, child, loc(start)) } + /// Consume the body of an absent function. + /// + /// AbsentFunction -> '(?~' RegexNode ')' + /// | '(?~|' Concatenation '|' Concatenation ')' + /// | '(?~|' Concatenation ')' + /// | '(?~|)' + /// + mutating func parseAbsentFunctionBody( + _ start: AST.Located + ) throws -> AST.AbsentFunction { + let startLoc = start.location + + // TODO: Diagnose on nested absent functions, which Oniguruma states is + // undefined behavior. + let kind: AST.AbsentFunction.Kind + switch start.value { + case .withoutPipe: + // Must be a repeater. + kind = .repeater(try parseNode()) + case .withPipe where source.peek() == ")": + kind = .clearer + case .withPipe: + // Can either be an expression or stopper depending on whether we have a + // any additional '|'s. + let child = try parseNode() + switch child { + case .alternation(let alt): + // A pipe, so an expression. + let numChildren = alt.children.count + guard numChildren == 2 else { + throw Source.LocatedError( + ParseError.tooManyAbsentExpressionChildren(numChildren), + child.location + ) + } + kind = .expression( + absentee: alt.children[0], pipe: alt.pipes[0], expr: alt.children[1]) + default: + // No pipes, so a stopper. + kind = .stopper(child) + } + } + try source.expect(")") + return .init(kind, start: startLoc, location: loc(startLoc.start)) + } + /// Parse a (potentially quantified) component /// - /// QuantOperand -> Conditional | Group | CustomCharClass | Atom - /// Group -> GroupStart Regex ')' - /// Conditional -> ConditionalStart Concatenation ('|' Concatenation)? ')' + /// QuantOperand -> Conditional | Group | CustomCharClass | Atom + /// | AbsentFunction + /// Group -> GroupStart RecursiveRegex ')' + /// Conditional -> ConditionalStart Concatenation ('|' Concatenation)? ')' /// ConditionalStart -> KnownConditionalStart | GroupConditionalStart /// mutating func parseQuantifierOperand() throws -> AST? { @@ -269,6 +333,11 @@ extension Parser { start: _start, .init(.group(group), group.location)) } + // Check if we have an Oniguruma absent function. + if let start = source.lexAbsentFunctionStart() { + return .absentFunction(try parseAbsentFunctionBody(start)) + } + // Check if we have the start of a group '('. if let kind = try source.lexGroupStart() { return .group(try parseGroupBody(start: _start, kind)) diff --git a/Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift b/Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift index c8832539d..000652391 100644 --- a/Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift +++ b/Sources/_MatchingEngine/Regex/Parse/SourceLocation.swift @@ -27,6 +27,9 @@ extension Source { ) where R.Bound == Source.Position { self.init(r.relative(to: input.input)) } + public init(from sub: Input.SubSequence) { + self.init(sub.startIndex ..< sub.endIndex) + } /// NOTE: This is a temporary measure to unblock DSL efforts and /// incremental source location tracking. This shouldn't be called from @@ -37,6 +40,9 @@ extension Source { public var isFake: Bool { self == Self.fake } public var isReal: Bool { !isFake } + /// Whether this location covers an empty range. This includes `isFake`. + public var isEmpty: Bool { start == end } + /// Returns the smallest location that contains both this location and /// another. public func union(with other: Location) -> SourceLocation { diff --git a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift index bbb6233f1..46a0047e8 100644 --- a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift +++ b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift @@ -144,7 +144,53 @@ extension AST.Atom { } extension AST.Atom.Callout: _ASTPrintable { - public var _dumpBase: String { "callout <\(arg.value)>" } + public var _dumpBase: String { + switch self { + case .pcre(let p): return "\(p)" + case .onigurumaNamed(let o): return "\(o)" + case .onigurumaOfContents(let o): return "\(o)" + } + } +} + +extension AST.Atom.Callout.PCRE: _ASTPrintable { + public var _dumpBase: String { + "PCRE callout \(arg.value)" + } +} + +extension AST.Atom.Callout.OnigurumaTag: _ASTPrintable { + public var _dumpBase: String { "[\(name.value)]" } +} + +extension AST.Atom.Callout.OnigurumaNamed.ArgList: _ASTPrintable { + public var _dumpBase: String { + "{\(args.map { $0.value }.joined(separator: ","))}" + } +} + +extension AST.Atom.Callout.OnigurumaNamed: _ASTPrintable { + public var _dumpBase: String { + var result = "named oniguruma callout \(name.value)" + if let tag = tag { + result += "\(tag)" + } + if let args = args { + result += "\(args)" + } + return result + } +} + +extension AST.Atom.Callout.OnigurumaOfContents: _ASTPrintable { + public var _dumpBase: String { + var result = "oniguruma callout of contents {\(contents.value)}" + if let tag = tag { + result += "\(tag)" + } + result += " \(direction.value)" + return result + } } extension AST.Reference: _ASTPrintable { @@ -268,3 +314,24 @@ extension AST.Group.BalancedCapture: _ASTPrintable { "\(name?.value ?? "")-\(priorName.value)" } } + +extension AST.AbsentFunction.Kind { + public var _dumpBase: String { + switch self { + case .repeater: + return "repeater" + case .expression: + return "expression" + case .stopper: + return "stopper" + case .clearer: + return "clearer" + } + } +} + +extension AST.AbsentFunction { + public var _dumpBase: String { + "absent function \(kind._dumpBase)" + } +} diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift index 41cfcd4e1..6f1e6cbb7 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift @@ -87,6 +87,9 @@ extension PrettyPrinter { case let .customCharacterClass(ccc): outputAsCanonical(ccc) + case let .absentFunction(abs): + outputAsCanonical(abs) + case .empty: output("") @@ -126,6 +129,25 @@ extension PrettyPrinter { mutating func outputAsCanonical(_ condition: AST.Conditional.Condition) { output("(/*TODO: conditional \(condition) */)") } + + mutating func outputAsCanonical(_ abs: AST.AbsentFunction) { + output("(?~") + switch abs.kind { + case .repeater(let a): + outputAsCanonical(a) + case .expression(let a, _, let child): + output("|") + outputAsCanonical(a) + output("|") + outputAsCanonical(child) + case .stopper(let a): + output("|") + outputAsCanonical(a) + case .clearer: + output("|") + } + output(")") + } } extension AST.Quote { diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift index 20eb9693b..17ea58de0 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift @@ -128,6 +128,9 @@ extension PrettyPrinter { case let .customCharacterClass(ccc): printAsPattern(ccc) + case let .absentFunction(abs): + print("/*TODO: absent function \(abs)*/") + case .empty: print("") case .groupTransform: print("// FIXME: get group transform out of here!") diff --git a/Sources/_StringProcessing/ASTBuilder.swift b/Sources/_StringProcessing/ASTBuilder.swift index f265ed798..afbe19b16 100644 --- a/Sources/_StringProcessing/ASTBuilder.swift +++ b/Sources/_StringProcessing/ASTBuilder.swift @@ -175,8 +175,46 @@ func groupCondition( .group(.init(.init(faking: kind), child, .fake)) } -func callout(_ arg: AST.Atom.Callout.Argument) -> AST { - atom(.callout(.init(.init(faking: arg)))) +func pcreCallout(_ arg: AST.Atom.Callout.PCRE.Argument) -> AST { + atom(.callout(.pcre(.init(.init(faking: arg))))) +} + +func absentRepeater(_ child: AST) -> AST { + .absentFunction(.init(.repeater(child), start: .fake, location: .fake)) +} +func absentExpression(_ absentee: AST, _ child: AST) -> AST { + .absentFunction(.init( + .expression(absentee: absentee, pipe: .fake, expr: child), + start: .fake, location: .fake + )) +} +func absentStopper(_ absentee: AST) -> AST { + .absentFunction(.init(.stopper(absentee), start: .fake, location: .fake)) + +} +func absentRangeClear() -> AST { + .absentFunction(.init(.clearer, start: .fake, location: .fake)) +} + +func onigurumaNamedCallout( + _ name: String, tag: String? = nil, args: String... +) -> AST { + atom(.callout(.onigurumaNamed(.init( + .init(faking: name), + tag: tag.map { .init(.fake, .init(faking: $0), .fake) }, + args: args.isEmpty ? nil : .init(.fake, args.map { .init(faking: $0) }, .fake) + )))) +} + +func onigurumaCalloutOfContents( + _ contents: String, tag: String? = nil, + direction: AST.Atom.Callout.OnigurumaOfContents.Direction = .inProgress +) -> AST { + atom(.callout(.onigurumaOfContents(.init( + .fake, .init(faking: contents), .fake, + tag: tag.map { .init(.fake, .init(faking: $0), .fake) }, + direction: .init(faking: direction) + )))) } func backtrackingDirective( diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 849e8ccb0..02ff8334e 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -93,6 +93,9 @@ class Compiler { case .trivia, .empty: break + case .absentFunction: + throw unsupported(node.renderAsCanonical()) + case .group(let g): if let lookaround = g.lookaroundKind { try emitLookaround(lookaround, g.child) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 1961dae9c..a54bd4c33 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -52,7 +52,7 @@ extension AST { return try ccc.generateConsumer(opts) case .alternation, .conditional, .concatenation, .group, .quantification, .quote, .trivia, .empty, - .groupTransform: return nil + .groupTransform, .absentFunction: return nil } } } diff --git a/Sources/_StringProcessing/Legacy/LegacyCompile.swift b/Sources/_StringProcessing/Legacy/LegacyCompile.swift index 362f98fc3..475a051c4 100644 --- a/Sources/_StringProcessing/Legacy/LegacyCompile.swift +++ b/Sources/_StringProcessing/Legacy/LegacyCompile.swift @@ -260,6 +260,9 @@ func compile( case .conditional: throw unsupported(ast.renderAsCanonical()) + case .absentFunction: + throw unsupported(ast.renderAsCanonical()) + case .customCharacterClass: fatalError("unreachable") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 30108b6bb..1aa9af18a 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1139,15 +1139,40 @@ extension RegexTests { // MARK: Callouts - parseTest(#"(?C)"#, callout(.number(0))) - parseTest(#"(?C0)"#, callout(.number(0))) - parseTest(#"(?C20)"#, callout(.number(20))) - parseTest("(?C{abc})", callout(.string("abc"))) + // PCRE callouts + + parseTest(#"(?C)"#, pcreCallout(.number(0))) + parseTest(#"(?C0)"#, pcreCallout(.number(0))) + parseTest(#"(?C20)"#, pcreCallout(.number(20))) + parseTest("(?C{abc})", pcreCallout(.string("abc"))) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", callout(.string("hello"))) + parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello"))) } + // Oniguruma named callouts + + parseTest("(*X)", onigurumaNamedCallout("X")) + parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t")) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b")) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b")) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c")) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c")) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>")) + parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c")) + + // Oniguruma 'of contents' callouts + + parseTest("(?{x})", onigurumaCalloutOfContents("x")) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y")) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x")) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag")) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction)) + parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both)) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x")) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x")) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\")) + // MARK: Backtracking directives parseTest("(*ACCEPT)?", zeroOrOne(.eager, backtrackingDirective(.accept))) @@ -1164,6 +1189,37 @@ extension RegexTests { parseTest("(*PRUNE)", backtrackingDirective(.prune)) parseTest("(*THEN)", backtrackingDirective(.then)) + // MARK: Oniguruma absent functions + + parseTest("(?~)", absentRepeater(empty())) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c"))) + parseTest("(?~a+)", absentRepeater(oneOrMore(.eager, "a"))) + parseTest("(?~~)", absentRepeater("~")) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c"))) + parseTest("(?~(a))", absentRepeater(capture("a")), captures: .empty) + parseTest("(?~)*", zeroOrMore(.eager, absentRepeater(empty()))) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c"))) + parseTest("(?~|a+)", absentStopper(oneOrMore(.eager, "a"))) + parseTest("(?~|~)", absentStopper("~")) + parseTest("(?~|(a))", absentStopper(capture("a")), captures: .empty) + parseTest("(?~|a){2}", exactly(.eager, 2, absentStopper("a"))) + + parseTest("(?~|a|b)", absentExpression("a", "b")) + parseTest("(?~|~|~)", absentExpression("~", "~")) + parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), + captures: .empty) + parseTest("(?~|(a)|(?:(b)|c))", absentExpression( + capture("a"), nonCapture(alt(capture("b"), "c")) + ), captures: .optional(.atom())) + parseTest("(?~|a|b)?", zeroOrOne(.eager, absentExpression("a", "b"))) + + parseTest("(?~|)", absentRangeClear()) + + // TODO: It's not really clear what this means, but Oniguruma parses it... + // Maybe we should diagnose it? + parseTest("(?~|)+", oneOrMore(.eager, absentRangeClear())) + // MARK: Parse with delimiters parseWithDelimitersTest("'/a b/'", concat("a", " ", "b")) @@ -1232,6 +1288,19 @@ extension RegexTests { parseNotEqualTest("(?C0)", "(?C1)") parseNotEqualTest("(?C0)", "(?C'hello')") + parseNotEqualTest("(*X)", "(*Y)") + parseNotEqualTest("(*X[a])", "(*X[b])") + parseNotEqualTest("(*X[a]{a})", "(*X[a]{b})") + parseNotEqualTest("(*X[a]{a})", "(*X[a])") + parseNotEqualTest("(*X{a})", "(*X[a]{a})") + parseNotEqualTest("(*X{a})", "(*X{a,b})") + + parseNotEqualTest("(?{a})", "(?{b})") + parseNotEqualTest("(?{a}[a])", "(?{a}[b])") + parseNotEqualTest("(?{a})", "(?{a}[a])") + parseNotEqualTest("(?{a}X)", "(?{a})") + parseNotEqualTest("(?{a}<)", "(?{a}X)") + parseNotEqualTest("(*ACCEPT)", "(*ACCEPT:a)") parseNotEqualTest("(*MARK:a)", "(*MARK:b)") parseNotEqualTest("(*:a)", "(*:b)") @@ -1240,6 +1309,14 @@ extension RegexTests { parseNotEqualTest("(?)", "(?)") parseNotEqualTest("(?)", "(?)") parseNotEqualTest("(?<-b>)", "(?)") + + parseNotEqualTest("(?~|)", "(?~|a)") + parseNotEqualTest("(?~|a)", "(?~|b)") + parseNotEqualTest("(?~|a)", "(?~|a|)") + parseNotEqualTest("(?~|a|b)", "(?~|a|)") + parseNotEqualTest("(?~|a|b)", "(?~|a|c)") + parseNotEqualTest("(?~)", "(?~|)") + parseNotEqualTest("(?~a)", "(?~b)") } func testParseSourceLocations() throws { @@ -1288,6 +1365,69 @@ extension RegexTests { $0.as(AST.Atom.self)!.as(AST.Reference.self)!.innerLoc }) + // MARK: Callout + + typealias Callout = AST.Atom.Callout + + rangeTest(#"(?C0)"#, range(3 ..< 4), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.PCRE.self)!.arg.location + }) + rangeTest(#"(?C)"#, range(3 ..< 3), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.PCRE.self)!.arg.location + }) + + rangeTest(#"(*abc[ta]{a,b})"#, range(2 ..< 5), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaNamed.self)!.name.location + }) + rangeTest(#"(*abc[ta]{a,b})"#, range(5 ..< 6), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaNamed.self)!.tag!.leftBracket + }) + rangeTest(#"(*abc[ta]{a,b})"#, range(8 ..< 9), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaNamed.self)!.tag!.rightBracket + }) + rangeTest(#"(*abc[ta]{a,b})"#, range(9 ..< 10), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaNamed.self)!.args!.leftBrace + }) + rangeTest(#"(*abc[ta]{a,b})"#, range(12 ..< 13), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaNamed.self)!.args!.args[1].location + }) + rangeTest(#"(*abc[ta]{a,b})"#, range(13 ..< 14), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaNamed.self)!.args!.rightBrace + }) + + rangeTest(#"(?{{{abc}}}[t]X)"#, range(2 ..< 5), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaOfContents.self)!.openBraces + }) + rangeTest(#"(?{{{abc}}}[t]X)"#, range(8 ..< 11), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaOfContents.self)!.closeBraces + }) + rangeTest(#"(?{{{abc}}}[t]X)"#, range(11 ..< 12), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaOfContents.self)!.tag!.leftBracket + }) + rangeTest(#"(?{{{abc}}}[t]X)"#, range(13 ..< 14), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaOfContents.self)!.tag!.rightBracket + }) + rangeTest(#"(?{{{abc}}}[t]X)"#, range(14 ..< 15), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaOfContents.self)!.direction.location + }) + rangeTest(#"(?{a})"#, range(5 ..< 5), at: { + $0.as(AST.Atom.self)!.as(Callout.self)! + .as(Callout.OnigurumaOfContents.self)!.direction.location + }) + // MARK: Conditionals rangeTest("(?(1))", entireRange) @@ -1309,12 +1449,22 @@ extension RegexTests { rangeTest("(?(xxx))", range(2 ..< 7), at: { $0.as(AST.Conditional.self)!.condition.location }) + + // MARK: Absent functions + + rangeTest("(?~a)", entireRange) + rangeTest("(?~|)", entireRange) + rangeTest("(?~|a)", entireRange) + rangeTest("(?~|a|b)", entireRange) } func testParseErrors() { - // MARK: Closing delimiters. + // MARK: Unbalanced delimiters. diagnosticTest("(", .expected(")")) + diagnosticTest(")", .unbalancedEndOfGroup) + diagnosticTest(")))", .unbalancedEndOfGroup) + diagnosticTest("())()", .unbalancedEndOfGroup) diagnosticTest(#"\u{5"#, .expected("}")) diagnosticTest(#"\x{5"#, .expected("}")) @@ -1345,10 +1495,10 @@ extension RegexTests { diagnosticTest("(?C", .expected(")")) - diagnosticTest("(?<", .expectedGroupName) + diagnosticTest("(?<", .expectedIdentifier(.groupName)) diagnosticTest("(?")) - diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) @@ -1360,16 +1510,15 @@ extension RegexTests { // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*")) - diagnosticTest("(*X)", .unknownGroupKind("*X")) diagnosticTest(#"(?k)"#, .unknownGroupKind("?k")) diagnosticTest(#"(?P#)"#, .invalidMatchingOption("#")) - diagnosticTest(#"(?<#>)"#, .groupNameMustBeAlphaNumeric) - diagnosticTest(#"(?'1A')"#, .groupNameCannotStartWithNumber) + diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) - diagnosticTest(#"(?'-')"#, .expectedGroupName) - diagnosticTest(#"(?'--')"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) + diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) // MARK: Matching options @@ -1381,22 +1530,22 @@ extension RegexTests { // MARK: References - diagnosticTest(#"\k''"#, .expectedGroupName) - diagnosticTest(#"(?&)"#, .expectedGroupName) - diagnosticTest(#"(?P>)"#, .expectedGroupName) + diagnosticTest(#"\k''"#, .expectedIdentifier(.groupName)) + diagnosticTest(#"(?&)"#, .expectedIdentifier(.groupName)) + diagnosticTest(#"(?P>)"#, .expectedIdentifier(.groupName)) diagnosticTest(#"\g{0}"#, .cannotReferToWholePattern) diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern) - diagnosticTest(#"(?&&)"#, .groupNameMustBeAlphaNumeric) - diagnosticTest(#"(?&-1)"#, .groupNameMustBeAlphaNumeric) - diagnosticTest(#"(?P>+1)"#, .groupNameMustBeAlphaNumeric) - diagnosticTest(#"(?P=+1)"#, .groupNameMustBeAlphaNumeric) - diagnosticTest(#"\k'#'"#, .groupNameMustBeAlphaNumeric) - diagnosticTest(#"(?&#)"#, .groupNameMustBeAlphaNumeric) + diagnosticTest(#"(?&&)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?P>1)"#, .groupNameCannotStartWithNumber) - diagnosticTest(#"\k{1}"#, .groupNameCannotStartWithNumber) + diagnosticTest(#"(?P>1)"#, .identifierCannotStartWithNumber(.groupName)) + diagnosticTest(#"\k{1}"#, .identifierCannotStartWithNumber(.groupName)) diagnosticTest(#"\g<1-1>"#, .expected(">")) diagnosticTest(#"\g{1-1}"#, .expected("}")) @@ -1414,9 +1563,31 @@ extension RegexTests { // MARK: Callouts + // PCRE callouts diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)")) diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)")) + // Oniguruma named callouts + diagnosticTest("(*bar[", .expectedIdentifier(.onigurumaCalloutTag)) + diagnosticTest("(*bar[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag)) + diagnosticTest("(*bar{", .expectedCalloutArgument) + diagnosticTest("(*bar}", .expected(")")) + diagnosticTest("(*bar]", .expected(")")) + + // Oniguruma 'of contents' callouts + diagnosticTest("(?{", .expected("}")) + diagnosticTest("(?{}", .expectedNonEmptyContents) + diagnosticTest("(?{x}", .expected(")")) + diagnosticTest("(?{x}}", .expected(")")) + diagnosticTest("(?{{x}}", .expected(")")) + diagnosticTest("(?{{x}", .expected("}")) + diagnosticTest("(?{x}[", .expectedIdentifier(.onigurumaCalloutTag)) + diagnosticTest("(?{x}[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag)) + diagnosticTest("(?{x}[a]", .expected(")")) + diagnosticTest("(?{x}[a]K", .expected(")")) + diagnosticTest("(?{x}[a]X", .expected(")")) + diagnosticTest("(?{{x}y}", .expected("}")) + // MARK: Backtracking directives diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) @@ -1428,5 +1599,12 @@ extension RegexTests { diagnosticTest("(*SKIP:a)*?", .notQuantifiable) diagnosticTest("(*F)+?", .notQuantifiable) diagnosticTest("(*:a){2}", .notQuantifiable) + + // MARK: Oniguruma absent functions + + diagnosticTest("(?~", .expected(")")) + diagnosticTest("(?~|", .expected(")")) + diagnosticTest("(?~|a|b|c)", .tooManyAbsentExpressionChildren(3)) + diagnosticTest("(?~||||)", .tooManyAbsentExpressionChildren(4)) } }