From 73980e1be049353aa1217dc6cd155a9813b2e0a5 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 31 Jan 2022 11:29:39 +0000 Subject: [PATCH 1/3] Move SyntaxOptions from Source to ParserContext And plumb through `context` parameters for more lexing methods. This is necessary as `(?x)` will be able to affect the syntax options set during parsing. --- .../Regex/Parse/LexicalAnalysis.swift | 50 ++++++++++++++----- .../_MatchingEngine/Regex/Parse/Parse.swift | 49 ++++++++++-------- .../_MatchingEngine/Regex/Parse/Source.swift | 16 +----- Tests/RegexTests/LexTests.swift | 2 +- 4 files changed, 68 insertions(+), 49 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index 18b536005..afefbf5d9 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -332,7 +332,7 @@ extension Source { /// Quantifier -> ('*' | '+' | '?' | '{' Range '}') QuantKind? /// QuantKind -> '?' | '+' /// - mutating func lexQuantifier() throws -> ( + mutating func lexQuantifier(context: ParsingContext) throws -> ( Located, Located )? { let amt: Located? = try recordLoc { src in @@ -341,7 +341,9 @@ extension Source { if src.tryEat("?") { return .zeroOrOne } return try src.tryEating { src in - guard src.tryEat("{"), let range = try src.lexRange(), src.tryEat("}") + guard src.tryEat("{"), + let range = try src.lexRange(context: context), + src.tryEat("}") else { return nil } return range.value } @@ -363,7 +365,7 @@ extension Source { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange() throws -> Located? { + mutating func lexRange(context: ParsingContext) throws -> Located? { try recordLoc { src in try src.tryEating { src in let lowerOpt = try src.lexNumber() @@ -375,7 +377,7 @@ extension Source { let closedRange: Bool? if src.tryEat(",") { closedRange = true - } else if src.experimentalRanges && src.tryEat(".") { + } else if context.experimentalRanges && src.tryEat(".") { try src.expect(".") if src.tryEat(".") { closedRange = true @@ -477,12 +479,12 @@ extension Source { /// /// TODO: Need to support some escapes /// - mutating func lexQuote() throws -> AST.Quote? { + mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { let str = try recordLoc { src -> String? in if src.tryEat(sequence: #"\Q"#) { return try src.expectQuoted(endingWith: #"\E"#).value } - if src.experimentalQuotes, src.tryEat("\"") { + if context.experimentalQuotes, src.tryEat("\"") { return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value } return nil @@ -501,12 +503,12 @@ extension Source { /// /// TODO: Swift-style nested comments, line-ending comments, etc /// - mutating func lexComment() throws -> AST.Trivia? { + mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { let trivia: Located? = try recordLoc { src in if src.tryEat(sequence: "(?#") { return try src.expectQuoted(endingWith: ")").value } - if src.experimentalComments, src.tryEat(sequence: "/*") { + if context.experimentalComments, src.tryEat(sequence: "/*") { return try src.expectQuoted(endingWith: "*/").value } return nil @@ -517,9 +519,13 @@ extension Source { /// Try to consume non-semantic whitespace as trivia /// + /// Whitespace -> ' '+ + /// /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set - mutating func lexNonSemanticWhitespace() throws -> AST.Trivia? 
{ - guard syntax.ignoreWhitespace else { return nil } + mutating func lexNonSemanticWhitespace( + context: ParsingContext + ) throws -> AST.Trivia? { + guard context.ignoreWhitespace else { return nil } let trivia: Located? = recordLoc { src in src.tryEatPrefix { $0 == " " }?.string } @@ -527,6 +533,20 @@ extension Source { return AST.Trivia(trivia) } + /// Try to consume trivia. + /// + /// Trivia -> Comment | Whitespace + /// + mutating func lexTrivia(context: ParsingContext) throws -> AST.Trivia? { + if let comment = try lexComment(context: context) { + return comment + } + if let whitespace = try lexNonSemanticWhitespace(context: context) { + return whitespace + } + return nil + } + /// Try to lex a matching option. /// /// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' @@ -761,6 +781,7 @@ extension Source { /// comments, like quotes, cannot be quantified. /// mutating func lexGroupStart( + context: ParsingContext ) throws -> Located? { try recordLoc { src in try src.tryEating { src in @@ -825,7 +846,7 @@ extension Source { } // (_:) - if src.experimentalCaptures && src.tryEat(sequence: "_:") { + if context.experimentalCaptures && src.tryEat(sequence: "_:") { return .nonCapture } // TODO: (name:) @@ -960,9 +981,12 @@ extension Source { /// /// GroupConditionalStart -> '(?' GroupStart /// - mutating func lexGroupConditionalStart() throws -> Located? { + mutating func lexGroupConditionalStart( + context: ParsingContext + ) throws -> Located? { try tryEating { src in - guard src.tryEat(sequence: "(?"), let group = try src.lexGroupStart() + guard src.tryEat(sequence: "(?"), + let group = try src.lexGroupStart(context: context) else { return nil } // Implicitly scoped groups are not supported here. diff --git a/Sources/_MatchingEngine/Regex/Parse/Parse.swift b/Sources/_MatchingEngine/Regex/Parse/Parse.swift index 08c8cf77e..4d6221e24 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Parse.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Parse.swift @@ -53,14 +53,17 @@ Lexical analysis provides the following: struct ParsingContext { /// Whether we're currently parsing in a custom character class. - var isInCustomCharacterClass = false + fileprivate(set) var isInCustomCharacterClass = false /// Tracks the number of group openings we've seen, to disambiguate the '\n' /// syntax as a backreference or an octal sequence. - fileprivate var priorGroupCount = 0 + private var priorGroupCount = 0 /// A set of used group names. - fileprivate var usedGroupNames = Set() + private var usedGroupNames = Set() + + /// The syntax options currently set. + fileprivate(set) var syntax: SyntaxOptions fileprivate mutating func recordGroup(_ g: AST.Group.Kind) { // TODO: Needs to track group number resets (?|...). @@ -70,8 +73,9 @@ struct ParsingContext { } } - private init() {} - static var none: ParsingContext { .init() } + init(syntax: SyntaxOptions) { + self.syntax = syntax + } /// Check whether a given reference refers to a prior group. 
func isPriorGroupRef(_ ref: AST.Reference.Kind) -> Bool { @@ -88,13 +92,22 @@ struct ParsingContext { private struct Parser { var source: Source - var context: ParsingContext = .none + var context: ParsingContext - init(_ source: Source) { + init(_ source: Source, syntax: SyntaxOptions) { self.source = source + self.context = ParsingContext(syntax: syntax) } } +extension ParsingContext { + var experimentalRanges: Bool { syntax.contains(.experimentalRanges) } + var experimentalCaptures: Bool { syntax.contains(.experimentalCaptures) } + var experimentalQuotes: Bool { syntax.contains(.experimentalQuotes) } + var experimentalComments: Bool { syntax.contains(.experimentalComments) } + var ignoreWhitespace: Bool { syntax.contains(.nonSemanticWhitespace) } +} + // Diagnostics extension Parser { mutating func report( @@ -182,24 +195,20 @@ extension Parser { // TODO: refactor loop body into function let _start = source.currentPosition - // Trivia -> `lexComment` | `lexNonSemanticWhitespace` - if let triv = try source.lexComment() { - result.append(.trivia(triv)) - continue - } - if let triv = try source.lexNonSemanticWhitespace() { + // Trivia -> `lexTrivia` + if let triv = try source.lexTrivia(context: context) { result.append(.trivia(triv)) continue } // Quote -> `lexQuote` - if let quote = try source.lexQuote() { + if let quote = try source.lexQuote(context: context) { result.append(.quote(quote)) continue } // Quantification -> QuantOperand Quantifier? if let operand = try parseQuantifierOperand() { - if let (amt, kind) = try source.lexQuantifier() { + if let (amt, kind) = try source.lexQuantifier(context: context) { let location = loc(_start) guard operand.isQuantifiable else { throw Source.LocatedError(ParseError.notQuantifiable, location) @@ -333,7 +342,7 @@ extension Parser { if let cond = try source.lexKnownConditionalStart(context: context) { return try parseConditionalBranches(start: _start, cond) } - if let kind = try source.lexGroupConditionalStart() { + if let kind = try source.lexGroupConditionalStart(context: context) { let groupStart = kind.location.start let group = try parseGroupBody(start: groupStart, kind) return try parseConditionalBranches( @@ -346,7 +355,7 @@ extension Parser { } // Check if we have the start of a group '('. - if let kind = try source.lexGroupStart() { + if let kind = try source.lexGroupStart(context: context) { return .group(try parseGroupBody(start: _start, kind)) } @@ -435,7 +444,7 @@ extension Parser { } // Quoted sequence. 
- if let quote = try source.lexQuote() { + if let quote = try source.lexQuote(context: context) { members.append(.quote(quote)) continue } @@ -463,8 +472,8 @@ public func parse( _ regex: S, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { - let source = Source(String(regex), syntax) - var parser = Parser(source) + let source = Source(String(regex)) + var parser = Parser(source, syntax: syntax) return try parser.parse() } diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift index 260ee3b63..11bd8152f 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Source.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Source.swift @@ -17,15 +17,13 @@ public struct Source { var input: Input var bounds: Range - var syntax: SyntaxOptions // TODO: source should hold outer collection and range, at least // for error reporting if nothing else - init(_ str: Input, _ syntax: SyntaxOptions) { + init(_ str: Input) { self.input = str self.bounds = str.startIndex ..< str.endIndex - self.syntax = syntax } subscript(_ range: Range) -> Input.SubSequence { input[range] } @@ -43,18 +41,6 @@ extension Source { public typealias Position = String.Index } -// MARK: - Syntax - -extension Source { - var experimentalRanges: Bool { syntax.contains(.experimentalRanges) } - var experimentalCaptures: Bool { syntax.contains(.experimentalCaptures) } - var experimentalQuotes: Bool { syntax.contains(.experimentalQuotes) } - var experimentalComments: Bool { syntax.contains(.experimentalComments) } - var nonSemanticWhitespace: Bool { - syntax.contains(.nonSemanticWhitespace) - } -} - // MARK: - Source as a peekable consumer extension Source { diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index e908a11da..ce82f8b8f 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -22,7 +22,7 @@ func diagnose( file: StaticString = #file, line: UInt = #line ) { - var src = Source(input, syntax) + var src = Source(input) do { try f(&src) XCTFail(""" From 7d139cd360f5f8602defe8ebd27389e42954de6b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 31 Jan 2022 11:29:40 +0000 Subject: [PATCH 2/3] Parse extended syntax If the `(?x)` or `(?xx)` options are active in a given scope, treat whitespace as non-semantic, including in custom character classes. Additionally parse end-of-line comments `# comment`. --- .../Regex/AST/CustomCharClass.swift | 19 +++ .../Regex/AST/MatchingOptions.swift | 19 ++- .../Regex/AST/Quantification.swift | 9 +- .../Regex/Parse/LexicalAnalysis.swift | 57 ++++++- .../_MatchingEngine/Regex/Parse/Parse.swift | 37 +++- .../Regex/Parse/SyntaxOptions.swift | 23 ++- .../Regex/Printing/DumpAST.swift | 10 +- .../Regex/Printing/PrintAsCanonical.swift | 13 +- .../Regex/Printing/PrintAsPattern.swift | 3 + .../_StringProcessing/CharacterClass.swift | 4 + .../_StringProcessing/ConsumerInterface.swift | 30 +++- Tests/RegexTests/ParseTests.swift | 161 ++++++++++++++++++ 12 files changed, 359 insertions(+), 26 deletions(-) diff --git a/Sources/_MatchingEngine/Regex/AST/CustomCharClass.swift b/Sources/_MatchingEngine/Regex/AST/CustomCharClass.swift index 3044d45bd..614048f0a 100644 --- a/Sources/_MatchingEngine/Regex/AST/CustomCharClass.swift +++ b/Sources/_MatchingEngine/Regex/AST/CustomCharClass.swift @@ -42,6 +42,9 @@ extension AST { /// the contents should be interpreted literally. case quote(Quote) + /// Trivia such as non-semantic whitespace. 
+ case trivia(Trivia) + /// A binary operator applied to sets of members `abc&&def` case setOperation([Member], Located, [Member]) } @@ -81,6 +84,7 @@ extension CustomCC.Member { case .range(let r): return r case .atom(let a): return a case .quote(let q): return q + case .trivia(let t): return t case .setOperation(let lhs, let op, let rhs): return (lhs, op, rhs) } } @@ -88,4 +92,19 @@ extension CustomCC.Member { func `as`(_ t: T.Type = T.self) -> T? { _associatedValue as? T } + + public var isTrivia: Bool { + if case .trivia = self { return true } + return false + } +} + +extension AST.CustomCharacterClass { + /// Strip trivia from the character class members. This does not recurse into + /// nested custom character classes. + public var strippingTriviaShallow: Self { + var copy = self + copy.members = copy.members.filter { !$0.isTrivia } + return copy + } } diff --git a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift index cd1c08e0f..f2b86d032 100644 --- a/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift +++ b/Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift @@ -50,6 +50,16 @@ extension AST { self.location = location } + /// If this is either the regular or extra extended syntax option. + public var isAnyExtended: Bool { + switch kind { + case .extended, .extraExtended: + return true + default: + return false + } + } + public var isTextSegmentMode: Bool { switch kind { case .textSegmentGraphemeMode, .textSegmentWordMode: @@ -93,6 +103,10 @@ extension AST { self.minusLoc = minusLoc self.removing = removing } + + /// Whether this set of matching options first resets the options before + /// adding onto them. + public var resetsCurrentOptions: Bool { caretLoc != nil } } } @@ -102,7 +116,10 @@ extension AST.MatchingOption: _ASTPrintable { extension AST.MatchingOptionSequence: _ASTPrintable { public var _dumpBase: String { - "adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)" + """ + adding: \(adding), removing: \(removing), \ + resetsCurrentOptions: \(resetsCurrentOptions) + """ } } diff --git a/Sources/_MatchingEngine/Regex/AST/Quantification.swift b/Sources/_MatchingEngine/Regex/AST/Quantification.swift index 941794935..f2189cb38 100644 --- a/Sources/_MatchingEngine/Regex/AST/Quantification.swift +++ b/Sources/_MatchingEngine/Regex/AST/Quantification.swift @@ -17,16 +17,23 @@ extension AST { public let child: AST.Node public let location: SourceLocation + /// Any trivia intermixed between the operand and the quantifier, as well + /// as between the quantifier characters themselves. This can occur in + /// extended syntax mode where PCRE permits e.g `x * +`. + public let trivia: [AST.Trivia] + public init( _ amount: Located, _ kind: Located, _ child: AST.Node, - _ r: SourceLocation + _ r: SourceLocation, + trivia: [AST.Trivia] ) { self.amount = amount self.kind = kind self.child = child self.location = r + self.trivia = trivia } @frozen diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift index afefbf5d9..9595dc420 100644 --- a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift @@ -332,9 +332,15 @@ extension Source { /// Quantifier -> ('*' | '+' | '?' | '{' Range '}') QuantKind? /// QuantKind -> '?' | '+' /// - mutating func lexQuantifier(context: ParsingContext) throws -> ( - Located, Located - )? 
{ + mutating func lexQuantifier( + context: ParsingContext + ) throws -> (Located, Located, [AST.Trivia])? { + var trivia: [AST.Trivia] = [] + + if let t = try lexNonSemanticWhitespace(context: context) { + trivia.append(t) + } + let amt: Located? = try recordLoc { src in if src.tryEat("*") { return .zeroOrMore } if src.tryEat("+") { return .oneOrMore } @@ -350,13 +356,18 @@ extension Source { } guard let amt = amt else { return nil } + // PCRE allows non-semantic whitespace here in extended syntax mode. + if let t = try lexNonSemanticWhitespace(context: context) { + trivia.append(t) + } + let kind: Located = recordLoc { src in if src.tryEat("?") { return .reluctant } if src.tryEat("+") { return .possessive } return .eager } - return (amt, kind) + return (amt, kind, trivia) } /// Try to consume a range, returning `nil` if unsuccessful. @@ -501,6 +512,10 @@ extension Source { /// /// ExpComment -> '/*' (!'*/' .)* '*/' /// + /// With `SyntaxOptions.endOfLineComments` + /// + /// EndOfLineComment -> '#' .* + /// /// TODO: Swift-style nested comments, line-ending comments, etc /// mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { @@ -511,6 +526,13 @@ extension Source { if context.experimentalComments, src.tryEat(sequence: "/*") { return try src.expectQuoted(endingWith: "*/").value } + if context.endOfLineComments, src.tryEat("#") { + // TODO: If we ever support multi-line regex literals, this will need + // to be updated to stop at a newline. Note though that PCRE specifies + // that the newline it matches against can be controlled by the global + // matching options e.g `(*CR)`, `(*ANY)`, ... + return src.lexUntil(\.isEmpty).value + } return nil } guard let trivia = trivia else { return nil } @@ -519,15 +541,36 @@ extension Source { /// Try to consume non-semantic whitespace as trivia /// - /// Whitespace -> ' '+ + /// Whitespace -> WhitespaceChar+ /// /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set mutating func lexNonSemanticWhitespace( context: ParsingContext ) throws -> AST.Trivia? { guard context.ignoreWhitespace else { return nil } + + func isWhitespace(_ c: Character) -> Bool { + // This is a list of characters that PCRE treats as whitespace when + // compiled with Unicode support. It is a subset of the characters with + // the `.isWhitespace` property. ICU appears to also follow this list. + // Oniguruma and .NET follow a subset of this list. + // + // FIXME: PCRE only treats space and tab characters as whitespace when + // inside a custom character class (and only treats whitespace as + // non-semantic there for the extra-extended `(?xx)` mode). If we get a + // strict-PCRE mode, we'll need to add a case for that. + switch c { + case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r + "\u{85}", "\u{200E}", // next line, left-to-right mark + "\u{200F}", "\u{2028}", // right-to-left-mark, line separator + "\u{2029}": // paragraph separator + return true + default: + return false + } + } let trivia: Located? = recordLoc { src in - src.tryEatPrefix { $0 == " " }?.string + src.tryEatPrefix(isWhitespace)?.string } guard let trivia = trivia else { return nil } return AST.Trivia(trivia) @@ -1631,7 +1674,7 @@ extension Source { var name: Located? if src.tryEat(":") { // TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the - // name under PCRE2_ALT_VERBNAMES. + // name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x). 
name = try src.expectQuoted(endingWith: ")", eatEnding: false) } try src.expect(")") diff --git a/Sources/_MatchingEngine/Regex/Parse/Parse.swift b/Sources/_MatchingEngine/Regex/Parse/Parse.swift index 4d6221e24..84c703068 100644 --- a/Sources/_MatchingEngine/Regex/Parse/Parse.swift +++ b/Sources/_MatchingEngine/Regex/Parse/Parse.swift @@ -106,6 +106,7 @@ extension ParsingContext { var experimentalQuotes: Bool { syntax.contains(.experimentalQuotes) } var experimentalComments: Bool { syntax.contains(.experimentalComments) } var ignoreWhitespace: Bool { syntax.contains(.nonSemanticWhitespace) } + var endOfLineComments: Bool { syntax.contains(.endOfLineComments) } } // Diagnostics @@ -208,12 +209,14 @@ extension Parser { } // Quantification -> QuantOperand Quantifier? if let operand = try parseQuantifierOperand() { - if let (amt, kind) = try source.lexQuantifier(context: context) { + if let (amt, kind, trivia) = + try source.lexQuantifier(context: context) { let location = loc(_start) guard operand.isQuantifiable else { throw Source.LocatedError(ParseError.notQuantifiable, location) } - result.append(.quantification(.init(amt, kind, operand, location))) + result.append(.quantification( + .init(amt, kind, operand, location, trivia: trivia))) } else { result.append(operand) } @@ -270,6 +273,28 @@ extension Parser { ) throws -> AST.Group { context.recordGroup(kind.value) + // Check if we're introducing or removing extended syntax. + // TODO: PCRE differentiates between (?x) and (?xx) where only the latter + // handles non-semantic whitespace in a custom character class. Other + // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, + // treat (?x) and (?xx) as the same option here. If we ever get a strict + // PCRE mode, we will need to change this to handle that. + let currentSyntax = context.syntax + if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { + if c.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if c.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if c.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } + } + defer { + context.syntax = currentSyntax + } + let child = try parseNode() // An implicit scoped group has already consumed its closing paren. if !kind.value.hasImplicitScope { @@ -449,6 +474,14 @@ extension Parser { continue } + // Lex non-semantic whitespace if we're allowed. + // TODO: ICU allows end-of-line comments in custom character classes, + // which we ought to support if we want to support multi-line regex. + if let trivia = try source.lexNonSemanticWhitespace(context: context) { + members.append(.trivia(trivia)) + continue + } + guard let atom = try source.lexAtom(context: context) else { break } // Range between atoms. diff --git a/Sources/_MatchingEngine/Regex/Parse/SyntaxOptions.swift b/Sources/_MatchingEngine/Regex/Parse/SyntaxOptions.swift index 457232da5..5135d8ec1 100644 --- a/Sources/_MatchingEngine/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_MatchingEngine/Regex/Parse/SyntaxOptions.swift @@ -23,11 +23,19 @@ public struct SyntaxOptions: OptionSet { /// `'a \. b' == '/a\.b/'` public static var nonSemanticWhitespace: Self { Self(1 << 0) } + /// `abc # comment` + public static var endOfLineComments: Self { Self(1 << 1) } + + /// `(?x)` `(?xx)` + public static var extendedSyntax: Self { + [.endOfLineComments, .nonSemanticWhitespace] + } + /// `'a "." b' == '/a\Q.\Eb/'` /// /// NOTE: Currently, this means we have raw quotes. 
/// Better would be to have real Swift string delimiter parsing logic. - public static var experimentalQuotes: Self { Self(1 << 1) } + public static var experimentalQuotes: Self { Self(1 << 2) } /// `'a /* comment */ b' == '/a(?#. comment )b/'` /// @@ -35,7 +43,7 @@ public struct SyntaxOptions: OptionSet { /// Traditional comments can't have `)`, not even escaped in them either, we /// can. Traditional comments can have `*/` in them, we can't without /// escaping. We don't currently do escaping. - public static var experimentalComments: Self { Self(1 << 2) } + public static var experimentalComments: Self { Self(1 << 3) } /// ``` /// 'a{n...m}' == '/a{n,m}/' @@ -44,11 +52,11 @@ public struct SyntaxOptions: OptionSet { /// 'a{...m}' == '/a{,m}/' /// 'a{...*)` /// `(_: .*)` == `(?:.*)` - public static var experimentalCaptures: Self { Self(1 << 4) } + public static var experimentalCaptures: Self { Self(1 << 5) } /* @@ -59,10 +67,9 @@ public struct SyntaxOptions: OptionSet { public static var traditional: Self { Self(0) } - public static var experimental: Self { Self(~0) } - - public var ignoreWhitespace: Bool { - contains(.nonSemanticWhitespace) + public static var experimental: Self { + // Experimental syntax enables everything except end-of-line comments. + Self(~0).subtracting(.endOfLineComments) } // TODO: Probably want to model strict-PCRE etc. options too. diff --git a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift index a130fb5a0..47142407a 100644 --- a/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift +++ b/Sources/_MatchingEngine/Regex/Printing/DumpAST.swift @@ -52,6 +52,9 @@ extension _ASTPrintable { if $0.isTrivia { return nil } return $0._dump() }.joined(separator: ",") + if sub.isEmpty { + return "\(_dumpBase)" + } return "\(_dumpBase)(\(sub))" } } @@ -287,7 +290,11 @@ extension AST.Quantification: _ASTPrintable { extension AST.CustomCharacterClass: _ASTNode { public var _dumpBase: String { - "customCharacterClass(\(members))" + // Exclude trivia for now, as we don't want it to appear when performing + // comparisons of dumped output in tests. + // TODO: We should eventually have some way of filtering out trivia for + // tests, so that it can appear in regular dumps. + return "customCharacterClass(\(strippingTriviaShallow.members))" } } @@ -298,6 +305,7 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable { case .atom(let a): return "\(a)" case .range(let r): return "\(r)" case .quote(let q): return "\(q)" + case .trivia(let t): return "\(t)" case .setOperation(let lhs, let op, let rhs): return "op \(lhs) \(op.value) \(rhs)" } diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift index b6f0759b2..4888975f3 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift @@ -91,9 +91,7 @@ extension PrettyPrinter { output(q._canonicalBase) case let .trivia(t): - // TODO: We might want to output comments... 
- _ = t - output("") + output(t._canonicalBase) case let .atom(a): output(a._canonicalBase) @@ -135,6 +133,8 @@ extension PrettyPrinter { output(a._canonicalBase) case .quote(let q): output(q._canonicalBase) + case .trivia(let t): + output(t._canonicalBase) case .setOperation: output("/* TODO: set operation \(self) */") } @@ -315,3 +315,10 @@ extension AST.GlobalMatchingOption.Kind { extension AST.GlobalMatchingOption { var _canonicalBase: String { "(*\(kind._canonicalBase))"} } + +extension AST.Trivia { + var _canonicalBase: String { + // TODO: We might want to output comments... + "" + } +} diff --git a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift index 8a6367af6..5f00986c0 100644 --- a/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift +++ b/Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift @@ -180,6 +180,9 @@ extension PrettyPrinter { } case .quote(let q): print("// TODO: quote \(q.literal._quoted) in custom character classes (should we split it?)") + case .trivia(let t): + // TODO: We might want to output comments... + _ = t case .setOperation: print("// TODO: Set operation: \(member)") } diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index e1e7dbe9c..92b2a76b9 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -420,6 +420,10 @@ extension AST.CustomCharacterClass { // Decompose quoted literal into literal characters. result += q.literal.map { .character($0) } + case .trivia: + // Not semantically important. + break + case .setOperation(let lhs, let op, let rhs): // FIXME: CharacterClass wasn't designed for set operations with // multiple components in each operand, we should fix that. For now, diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 87a910279..a444368c3 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -26,10 +26,31 @@ struct Unsupported: Error, CustomStringConvertible { func unsupported( _ s: String, file: StaticString = #file, - line: UInt = #line + line: Int = #line ) -> Unsupported { return Unsupported( - message: s, file: String(describing: file), line: Int(line)) + message: s, file: String(describing: file), line: line) +} + +struct Unreachable: Error, CustomStringConvertible { + var message: String + var file: String + var line: Int + + var description: String { """ + Unreachable: '\(message)' + \(file):\(line) + """ + } +} + +func unreachable( + _ s: String, + file: StaticString = #file, + line: Int = #line +) -> Unreachable { + return Unreachable( + message: s, file: String(describing: file), line: line) } extension AST.Node { @@ -166,6 +187,9 @@ extension AST.CustomCharacterClass.Member { return nil } + case .trivia: + throw unreachable("Should have been stripped by caller") + case .setOperation(let lhs, let op, let rhs): // TODO: We should probably have a component type // instead of a members array... 
for now we reconstruct @@ -216,7 +240,7 @@ extension AST.CustomCharacterClass { _ opts: MatchingOptions ) throws -> Program.ConsumeFunction { // NOTE: Easy way to implement, obviously not performant - let consumers = try members.map { + let consumers = try strippingTriviaShallow.members.map { try $0.generateConsumer(opts) } return { input, bounds in diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4c17bc5dc..1346835fb 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1272,11 +1272,167 @@ extension RegexTests { parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) + // MARK: Trivia + + parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) + parseTest("# abc", concat("#", " ", "a", "b", "c")) + + parseTest("(?x) # hello", changeMatchingOptions(matchingOptions( + adding: .extended), isIsolated: true, empty())) + parseTest("(?xx) # hello", changeMatchingOptions(matchingOptions( + adding: .extraExtended), isIsolated: true, empty())) + parseTest("(?x) \\# abc", changeMatchingOptions(matchingOptions( + adding: .extended), isIsolated: true, concat("#", "a", "b", "c"))) + parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( + adding: .extraExtended), isIsolated: true, concat(" "))) + + // End of line comments aren't applicable in custom char classes. + // TODO: ICU supports this. + parseTest( + "(?x)[ # abc]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("#", "a", "b", "c")) + ) + + parseTest( + "(?x)a b c[d e f]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "b", "c", charClass("d", "e", "f"))) + ) + parseTest( + "(?xx)a b c[d e f]", changeMatchingOptions( + matchingOptions(adding: .extraExtended), isIsolated: true, + concat("a", "b", "c", charClass("d", "e", "f"))) + ) + parseTest( + "(?x)a b c(?-x)d e f", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended), + isIsolated: true, concat("d", " ", "e", " ", "f")))) + ) + parseTest( + "(?x)a b c(?-xx)d e f", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extraExtended), + isIsolated: true, concat("d", " ", "e", " ", "f")))) + ) + parseTest( + "(?xx)a b c(?-x)d e f", changeMatchingOptions( + matchingOptions(adding: .extraExtended), isIsolated: true, + concat("a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended), + isIsolated: true, concat("d", " ", "e", " ", "f")))) + ) + parseTest( + "(?x)a b c(?^i)d e f", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive), + isIsolated: true, concat("d", " ", "e", " ", "f")))) + ) + parseTest( + "(?x)a b c(?^x)d e f", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .extended), + isIsolated: true, concat("d", "e", "f")))) + ) + parseTest( + "(?:(?x)a b c)d e f", concat(nonCapture(changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "b", "c"))), "d", " ", "e", " ", "f") + ) + parseTest( + "(?x:a b c)# hi", concat(changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: false, + concat("a", "b", "c")), 
"#", " ", "h", "i") + ) + + parseTest( + "(?x-x)a b c", changeMatchingOptions( + matchingOptions(adding: .extended, removing: .extended), isIsolated: true, + concat("a", " ", "b", " ", "c")) + ) + parseTest( + "(?xxx-x)a b c", changeMatchingOptions( + matchingOptions(adding: .extraExtended, .extended, removing: .extended), isIsolated: true, + concat("a", " ", "b", " ", "c")) + ) + parseTest( + "(?xx-i)a b c", changeMatchingOptions( + matchingOptions(adding: .extraExtended, removing: .caseInsensitive), isIsolated: true, + concat("a", "b", "c")) + ) + + // PCRE states that whitespace seperating quantifiers is permitted under + // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 + parseTest( + "(?x)a *", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + zeroOrMore(.eager, "a")) + ) + parseTest( + "(?x)a + ?", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + oneOrMore(.reluctant, "a")) + ) + parseTest( + "(?x)a {2,4}", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + quantRange(.eager, 2 ... 4, "a")) + ) + + // PCRE states that whitespace won't be ignored within a range. + // http://pcre.org/current/doc/html/pcre2api.html#SEC20 + // TODO: We ought to warn on this, and produce a range anyway. + parseTest( + "(?x)a{1, 3}", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat("a", "{", "1", ",", "3", "}")) + ) + + // Test that we cover the list of whitespace characters covered by PCRE. + parseTest( + "(?x)a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, concat("a", "b")) + ) + parseTest( + "(?x)[a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b]", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) + ) + // MARK: Parse with delimiters parseWithDelimitersTest("'/a b/'", concat("a", " ", "b")) parseWithDelimitersTest("'|a b|'", concat("a", "b")) + parseWithDelimitersTest("'|[a b]|'", charClass("a", "b")) + parseWithDelimitersTest( + "'|(?-x)[a b]|'", changeMatchingOptions( + matchingOptions(removing: .extended), isIsolated: true, + charClass("a", " ", "b")) + ) + parseWithDelimitersTest("'|[[a ] b]|'", charClass(charClass("a"), "b")) + + // Non-semantic whitespace between quantifier characters for consistency + // with PCRE. + parseWithDelimitersTest("'|a * ?|'", zeroOrMore(.reluctant, "a")) + + // End-of-line comments aren't enabled by default in experimental syntax. + parseWithDelimitersTest("'|#abc|'", concat("#", "a", "b", "c")) + parseWithDelimitersTest("'|(?x)#abc|'", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + empty()) + ) + parseWithDelimitersTest("'|||'", alt(empty(), empty())) parseWithDelimitersTest("'||||'", alt(empty(), empty(), empty())) parseWithDelimitersTest("'|a||'", alt("a", empty())) @@ -1285,6 +1441,7 @@ extension RegexTests { // Make sure dumping output correctly reflects differences in AST. 
parseNotEqualTest(#"abc"#, #"abd"#) + parseNotEqualTest(#" "#, #""#) parseNotEqualTest(#"[\p{Any}]"#, #"[[:Any:]]"#) @@ -1303,6 +1460,8 @@ extension RegexTests { parseNotEqualTest(#"([a-c&&e]*)+"#, #"([a-d&&e]*)+"#) + parseNotEqualTest(#"[abc]"#, #"[a b c]"#) + parseNotEqualTest(#"\1"#, #"\10"#) parseNotEqualTest("(?^:)", ("(?-:)")) @@ -1584,6 +1743,8 @@ extension RegexTests { diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'a-b-c')"#, .expected("'")) + diagnosticTest("(?x)(? : )", .unknownGroupKind("? ")) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) From c389e5a21a7d5d9ec4951b46adc8fa0dc4a2bb08 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 31 Jan 2022 11:29:41 +0000 Subject: [PATCH 3/3] Tweak quantifier AST builders Because of the unlabeled parameters, callers were never able to use the `.eager` default. Give the child parameter an argument label so they can. --- Sources/_StringProcessing/ASTBuilder.swift | 24 ++--- Sources/_StringProcessing/RegexDSL/DSL.swift | 6 +- Tests/RegexTests/ParseTests.swift | 101 +++++++++---------- Tests/RegexTests/SyntaxOptionsTests.swift | 18 ++-- 4 files changed, 73 insertions(+), 76 deletions(-) diff --git a/Sources/_StringProcessing/ASTBuilder.swift b/Sources/_StringProcessing/ASTBuilder.swift index dda007ca6..bbc199d27 100644 --- a/Sources/_StringProcessing/ASTBuilder.swift +++ b/Sources/_StringProcessing/ASTBuilder.swift @@ -242,51 +242,51 @@ func quant( _ child: AST.Node ) -> AST.Node { .quantification(.init( - .init(faking: amount), .init(faking: kind), child, .fake)) + .init(faking: amount), .init(faking: kind), child, .fake, trivia: [])) } func zeroOrMore( _ kind: AST.Quantification.Kind = .eager, - _ child: AST.Node + of child: AST.Node ) -> AST.Node { quant(.zeroOrMore, kind, child) } func zeroOrOne( _ kind: AST.Quantification.Kind = .eager, - _ child: AST.Node + of child: AST.Node ) -> AST.Node { quant(.zeroOrOne, kind, child) } func oneOrMore( _ kind: AST.Quantification.Kind = .eager, - _ child: AST.Node + of child: AST.Node ) -> AST.Node { quant(.oneOrMore, kind, child) } func exactly( - _ kind: AST.Quantification.Kind = .eager, _ i: Int, - _ child: AST.Node + _ kind: AST.Quantification.Kind = .eager, + of child: AST.Node ) -> AST.Node { quant(.exactly(.init(faking: i)), kind, child) } func nOrMore( - _ kind: AST.Quantification.Kind = .eager, _ i: Int, - _ child: AST.Node + _ kind: AST.Quantification.Kind = .eager, + of child: AST.Node ) -> AST.Node { quant(.nOrMore(.init(faking: i)), kind, child) } func upToN( - _ kind: AST.Quantification.Kind = .eager, _ i: Int, - _ child: AST.Node + _ kind: AST.Quantification.Kind = .eager, + of child: AST.Node ) -> AST.Node { quant(.upToN(.init(faking: i)), kind, child) } func quantRange( - _ kind: AST.Quantification.Kind = .eager, _ r: ClosedRange, - _ child: AST.Node + _ kind: AST.Quantification.Kind = .eager, + of child: AST.Node ) -> AST.Node { let lower = AST.Located(faking: r.lowerBound) let upper = AST.Located(faking: r.upperBound) diff --git a/Sources/_StringProcessing/RegexDSL/DSL.swift b/Sources/_StringProcessing/RegexDSL/DSL.swift index f39ae20a7..fde14a674 100644 --- a/Sources/_StringProcessing/RegexDSL/DSL.swift +++ b/Sources/_StringProcessing/RegexDSL/DSL.swift @@ -67,7 +67,7 @@ public struct OneOrMore: RegexProtocolWithComponent { public init(component: Component) { self.regex = .init(ast: - oneOrMore(.eager, component.regex.ast.root) + oneOrMore(of: component.regex.ast.root) ) } @@ -93,7 +93,7 
@@ public struct Repeat< public init(component: Component) { self.regex = .init(ast: - zeroOrMore(.eager, component.regex.ast.root)) + zeroOrMore(of: component.regex.ast.root)) } public init(@RegexBuilder _ content: () -> Component) { @@ -116,7 +116,7 @@ public struct Optionally: RegexProtocolWithComponent { public init(component: Component) { self.regex = .init(ast: - zeroOrOne(.eager, component.regex.ast.root)) + zeroOrOne(of: component.regex.ast.root)) } public init(@RegexBuilder _ content: () -> Component) { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1346835fb..4722ec57c 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -201,7 +201,7 @@ extension RegexTests { "abc", concat("a", "b", "c")) parseTest( #"abc\+d*"#, - concat("a", "b", "c", "+", zeroOrMore(.eager, "d"))) + concat("a", "b", "c", "+", zeroOrMore(of: "d"))) parseTest( "a(b)", concat("a", capture("b")), captures: .atom()) @@ -211,31 +211,31 @@ extension RegexTests { concat( "a", "b", "c", oneOrMore( - .eager, nonCapture(concat("d", "e"))), - "f", "g", "h", zeroOrMore(.eager, "i"), "k"), + of: nonCapture(concat("d", "e"))), + "f", "g", "h", zeroOrMore(of: "i"), "k"), "j")) parseTest( "a(?:b|c)?d", concat("a", zeroOrOne( - .eager, nonCapture(alt("b", "c"))), "d")) + of: nonCapture(alt("b", "c"))), "d")) parseTest( "a?b??c+d+?e*f*?", concat( - zeroOrOne(.eager, "a"), zeroOrOne(.reluctant, "b"), - oneOrMore(.eager, "c"), oneOrMore(.reluctant, "d"), - zeroOrMore(.eager, "e"), zeroOrMore(.reluctant, "f"))) + zeroOrOne(of: "a"), zeroOrOne(.reluctant, of: "b"), + oneOrMore(of: "c"), oneOrMore(.reluctant, of: "d"), + zeroOrMore(of: "e"), zeroOrMore(.reluctant, of: "f"))) parseTest( "(.)*(.*)", concat( - zeroOrMore(.eager, capture(atom(.any))), - capture(zeroOrMore(.eager, atom(.any)))), + zeroOrMore(of: capture(atom(.any))), + capture(zeroOrMore(of: atom(.any)))), captures: .tuple([.array(.atom()), .atom()])) parseTest( "((.))*((.)?)", concat( - zeroOrMore(.eager, capture(capture(atom(.any)))), - capture(zeroOrOne(.eager, capture(atom(.any))))), + zeroOrMore(of: capture(capture(atom(.any)))), + capture(zeroOrOne(of: capture(atom(.any))))), captures: .tuple([ .array(.atom()), .array(.atom()), .atom(), .optional(.atom()) ])) @@ -247,7 +247,7 @@ extension RegexTests { parseTest( "a|b?c", - alt("a", concat(zeroOrOne(.eager, "b"), "c"))) + alt("a", concat(zeroOrOne(of: "b"), "c"))) parseTest( "(a|b)c", concat(capture(alt("a", "b")), "c"), @@ -419,7 +419,7 @@ extension RegexTests { parseTest( #"[a[bc]de&&[^bc]\d]+"#, - oneOrMore(.eager, charClass( + oneOrMore(of: charClass( .setOperation( ["a", charClass("b", "c"), "d", "e"], .init(faking: .intersection), @@ -448,13 +448,13 @@ extension RegexTests { parseTest( "a&&b", concat("a", "&", "&", "b")) parseTest( - "&?", zeroOrOne(.eager, "&")) + "&?", zeroOrOne(of: "&")) parseTest( - "&&?", concat("&", zeroOrOne(.eager, "&"))) + "&&?", concat("&", zeroOrOne(of: "&"))) parseTest( - "--+", concat("-", oneOrMore(.eager, "-"))) + "--+", concat("-", oneOrMore(of: "-"))) parseTest( - "~~*", concat("~", zeroOrMore(.eager, "~"))) + "~~*", concat("~", zeroOrMore(of: "~"))) // MARK: Quotes @@ -496,25 +496,25 @@ extension RegexTests { parseTest( #"a{1,2}"#, - quantRange(.eager, 1...2, "a")) + quantRange(1...2, of: "a")) parseTest( #"a{,2}"#, - upToN(.eager, 2, "a")) + upToN(2, of: "a")) parseTest( #"a{2,}"#, - nOrMore(.eager, 2, "a")) + nOrMore(2, of: "a")) parseTest( #"a{1}"#, - exactly(.eager, 1, "a")) + exactly(1, of: 
"a")) parseTest( #"a{1,2}?"#, - quantRange(.reluctant, 1...2, "a")) + quantRange(1...2, .reluctant, of: "a")) parseTest( #"a{0}"#, - exactly(.eager, 0, "a")) + exactly(0, of: "a")) parseTest( #"a{0,0}"#, - quantRange(.eager, 0...0, "a")) + quantRange(0...0, of: "a")) // Make sure ranges get treated as literal if invalid. parseTest("{", "{") @@ -524,16 +524,16 @@ extension RegexTests { parseTest("{,6", concat("{", ",", "6")) parseTest("{6", concat("{", "6")) parseTest("{6,", concat("{", "6", ",")) - parseTest("{+", oneOrMore(.eager, "{")) - parseTest("{6,+", concat("{", "6", oneOrMore(.eager, ","))) + parseTest("{+", oneOrMore(of: "{")) + parseTest("{6,+", concat("{", "6", oneOrMore(of: ","))) parseTest("x{", concat("x", "{")) parseTest("x{}", concat("x", "{", "}")) parseTest("x{,}", concat("x", "{", ",", "}")) parseTest("x{,6", concat("x", "{", ",", "6")) parseTest("x{6", concat("x", "{", "6")) parseTest("x{6,", concat("x", "{", "6", ",")) - parseTest("x{+", concat("x", oneOrMore(.eager, "{"))) - parseTest("x{6,+", concat("x", "{", "6", oneOrMore(.eager, ","))) + parseTest("x{+", concat("x", oneOrMore(of: "{"))) + parseTest("x{6,+", concat("x", "{", "6", oneOrMore(of: ","))) // TODO: We should emit a diagnostic for this. parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}")) @@ -915,14 +915,11 @@ extension RegexTests { parseTest(#"\N{abc}"#, atom(.namedCharacter("abc"))) parseTest(#"[\N{abc}]"#, charClass(atom_m(.namedCharacter("abc")))) - parseTest( - #"\N{abc}+"#, - oneOrMore(.eager, - atom(.namedCharacter("abc")))) + parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), - exactly(.eager, 2, " "))) + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")) + ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) parseTest(#"\N{U+AA}"#, scalar("\u{AA}")) @@ -945,7 +942,7 @@ extension RegexTests { parseTest(#"[\p{C}]"#, charClass(prop_m(.generalCategory(.other)))) parseTest( #"\p{C}+"#, - oneOrMore(.eager, prop(.generalCategory(.other)))) + oneOrMore(of: prop(.generalCategory(.other)))) parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx"))) parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL"))) @@ -1064,7 +1061,7 @@ extension RegexTests { captures: .atom(name: "a1") ) - parseTest(#"(?(1))?"#, zeroOrOne(.eager, conditional( + parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()))) parseTest(#"(?(R)a|b)"#, conditional( @@ -1108,9 +1105,9 @@ extension RegexTests { parseTest(#"(?((a)?(b))(a)+|b)"#, conditional( groupCondition(.capture, concat( - zeroOrOne(.eager, capture("a")), capture("b") + zeroOrOne(of: capture("a")), capture("b") )), - trueBranch: oneOrMore(.eager, capture("a")), + trueBranch: oneOrMore(of: capture("a")), falseBranch: "b" ), captures: .tuple([ .atom(), .optional(.atom()), .atom(), .optional(.array(.atom())) @@ -1118,9 +1115,9 @@ extension RegexTests { parseTest(#"(?(?:(a)?(b))(a)+|b)"#, conditional( groupCondition(.nonCapture, concat( - zeroOrOne(.eager, capture("a")), capture("b") + zeroOrOne(of: capture("a")), capture("b") )), - trueBranch: oneOrMore(.eager, capture("a")), + trueBranch: oneOrMore(of: capture("a")), falseBranch: "b" ), captures: .tuple([ .optional(.atom()), .atom(), .optional(.array(.atom())) @@ -1190,10 +1187,10 @@ extension RegexTests { // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(.eager, backtrackingDirective(.accept))) + parseTest("(*ACCEPT)?", zeroOrOne(of: 
backtrackingDirective(.accept))) parseTest( "(*ACCEPT:a)??", - zeroOrOne(.reluctant, backtrackingDirective(.accept, name: "a")) + zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")) ) parseTest("(*:a)", backtrackingDirective(.mark, name: "a")) parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a")) @@ -1208,17 +1205,17 @@ extension RegexTests { parseTest("(?~)", absentRepeater(empty())) parseTest("(?~abc)", absentRepeater(concat("a", "b", "c"))) - parseTest("(?~a+)", absentRepeater(oneOrMore(.eager, "a"))) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a"))) parseTest("(?~~)", absentRepeater("~")) parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c"))) parseTest("(?~(a))", absentRepeater(capture("a")), captures: .empty) - parseTest("(?~)*", zeroOrMore(.eager, absentRepeater(empty()))) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty()))) parseTest("(?~|abc)", absentStopper(concat("a", "b", "c"))) - parseTest("(?~|a+)", absentStopper(oneOrMore(.eager, "a"))) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a"))) parseTest("(?~|~)", absentStopper("~")) parseTest("(?~|(a))", absentStopper(capture("a")), captures: .empty) - parseTest("(?~|a){2}", exactly(.eager, 2, absentStopper("a"))) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a"))) parseTest("(?~|a|b)", absentExpression("a", "b")) parseTest("(?~|~|~)", absentExpression("~", "~")) @@ -1227,13 +1224,13 @@ extension RegexTests { parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) ), captures: .optional(.atom())) - parseTest("(?~|a|b)?", zeroOrOne(.eager, absentExpression("a", "b"))) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b"))) parseTest("(?~|)", absentRangeClear()) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? - parseTest("(?~|)+", oneOrMore(.eager, absentRangeClear())) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear())) // MARK: Global matching options @@ -1372,19 +1369,19 @@ extension RegexTests { "(?x)a *", changeMatchingOptions( matchingOptions(adding: .extended), isIsolated: true, - zeroOrMore(.eager, "a")) + zeroOrMore(of: "a")) ) parseTest( "(?x)a + ?", changeMatchingOptions( matchingOptions(adding: .extended), isIsolated: true, - oneOrMore(.reluctant, "a")) + oneOrMore(.reluctant, of: "a")) ) parseTest( "(?x)a {2,4}", changeMatchingOptions( matchingOptions(adding: .extended), isIsolated: true, - quantRange(.eager, 2 ... 4, "a")) + quantRange(2 ... 4, of: "a")) ) // PCRE states that whitespace won't be ignored within a range. @@ -1424,7 +1421,7 @@ extension RegexTests { // Non-semantic whitespace between quantifier characters for consistency // with PCRE. - parseWithDelimitersTest("'|a * ?|'", zeroOrMore(.reluctant, "a")) + parseWithDelimitersTest("'|a * ?|'", zeroOrMore(.reluctant, of: "a")) // End-of-line comments aren't enabled by default in experimental syntax. 
parseWithDelimitersTest("'|#abc|'", concat("#", "a", "b", "c")) diff --git a/Tests/RegexTests/SyntaxOptionsTests.swift b/Tests/RegexTests/SyntaxOptionsTests.swift index d6e0d6bc5..881eb0cbc 100644 --- a/Tests/RegexTests/SyntaxOptionsTests.swift +++ b/Tests/RegexTests/SyntaxOptionsTests.swift @@ -15,7 +15,7 @@ import XCTest private let dplus = oneOrMore( - .eager, atom(.escaped(.decimalDigit))) + of: atom(.escaped(.decimalDigit))) private let dotAST = concat( dplus, ".", dplus, ".", dplus, ".", dplus) private let dotASTQuoted = concat( @@ -61,34 +61,34 @@ extension RegexTests { func testExperimentalRanges() { parseTest( #"a{1,2}"#, - quantRange(.eager, 1...2, "a")) + quantRange(1...2, of: "a")) parseTest( #"a{1...2}"#, - quantRange(.eager, 1...2, "a"), + quantRange(1...2, of: "a"), syntax: .experimentalRanges) parseTest( #"a{1..<3}"#, - quantRange(.eager, 1...2, "a"), + quantRange(1...2, of: "a"), syntax: .experimentalRanges) parseTest( #"a{,2}"#, - upToN(.eager, 2, "a")) + upToN(2, of: "a")) parseTest( #"a{...2}"#, - upToN(.eager, 2, "a"), + upToN(2, of: "a"), syntax: .experimental) parseTest( #"a{..<3}"#, - upToN(.eager, 2, "a"), + upToN(2, of: "a"), syntax: .experimental) parseTest( #"a{1,}"#, - nOrMore(.eager, 1, "a")) + nOrMore(1, of: "a")) parseTest( #"a{1...}"#, - nOrMore(.eager, 1, "a"), + nOrMore(1, of: "a"), syntax: .experimental) }