Skip to content

Parse global matching options #133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion Sources/_MatchingEngine/Regex/AST/AST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
/// node.
public struct AST: Hashable {
  /// The root node of the tree.
  public var root: AST.Node

  /// The global matching options attached to this tree, or `nil` when there
  /// are none.
  public var globalOptions: GlobalMatchingOptionSequence?

  /// Creates an AST from a root node and an optional sequence of global
  /// matching options.
  ///
  /// - Parameters:
  ///   - root: The root node of the tree.
  ///   - globalOptions: The global matching options to attach, or `nil`.
  public init(_ root: AST.Node, globalOptions: GlobalMatchingOptionSequence?) {
    self.root = root
    self.globalOptions = globalOptions
  }
}

Expand Down Expand Up @@ -291,6 +294,20 @@ extension AST {
/// a group.
public var recursesWholePattern: Bool { kind == .recurseWholePattern }
}

/// A set of global matching options in a regular expression literal.
///
/// The initializer rejects an empty array, and `location` relies on at least
/// one option being present.
public struct GlobalMatchingOptionSequence: Hashable {
  /// The options making up the sequence.
  public var options: [AST.GlobalMatchingOption]

  /// Creates a sequence from `options`, or returns `nil` if `options` is
  /// empty.
  public init?(_ options: [AST.GlobalMatchingOption]) {
    if options.isEmpty { return nil }
    self.options = options
  }

  /// The union of the first option's location with the last option's
  /// location.
  public var location: SourceLocation {
    let start = options.first!.location
    let end = options.last!.location
    return start.union(with: end)
  }
}
}

// FIXME: Get this out of here
Expand Down
79 changes: 79 additions & 0 deletions Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,82 @@ extension AST.MatchingOptionSequence: _ASTPrintable {
"adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)"
}
}

extension AST {
/// Global matching option specifiers. Unlike `MatchingOptionSequence`,
/// these must appear at the start of the pattern, and apply globally.
public struct GlobalMatchingOption: _ASTNode, Hashable {
/// Determines the definition of a newline for the '.' character class.
public enum NewlineMatching: Hashable {
/// (*CR)
case carriageReturnOnly

/// (*LF)
case linefeedOnly

/// (*CRLF)
case carriageAndLinefeedOnly

/// (*ANYCRLF)
case anyCarriageReturnOrLinefeed

/// (*ANY)
case anyUnicode

/// (*NUL)
case nulCharacter
}
/// Determines what `\R` matches.
public enum NewlineSequenceMatching: Hashable {
/// (*BSR_ANYCRLF)
case anyCarriageReturnOrLinefeed

/// (*BSR_UNICODE)
case anyUnicode
}
/// The individual option that was written, together with any argument it
/// carries. Each case is annotated with its source spelling.
public enum Kind: Hashable {
/// (*LIMIT_DEPTH=d)
case limitDepth(Located<Int>)

/// (*LIMIT_HEAP=d)
case limitHeap(Located<Int>)

/// (*LIMIT_MATCH=d)
case limitMatch(Located<Int>)

/// (*NOTEMPTY)
case notEmpty

/// (*NOTEMPTY_ATSTART)
case notEmptyAtStart

/// (*NO_AUTO_POSSESS)
case noAutoPossess

/// (*NO_DOTSTAR_ANCHOR)
case noDotStarAnchor

/// (*NO_JIT)
case noJIT

/// (*NO_START_OPT)
case noStartOpt

/// (*UTF)
case utfMode

/// (*UCP)
case unicodeProperties

/// One of the `NewlineMatching` options, e.g. (*CRLF).
case newlineMatching(NewlineMatching)
/// One of the `NewlineSequenceMatching` options, e.g. (*BSR_UNICODE).
case newlineSequenceMatching(NewlineSequenceMatching)
}
/// Which option this is.
public var kind: Kind
/// Where the option appears in the source.
public var location: SourceLocation

/// Creates an option of the given kind at the given source location.
public init(_ kind: Kind, _ location: SourceLocation) {
self.kind = kind
self.location = location
}
}
}
4 changes: 4 additions & 0 deletions Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ enum ParseError: Error, Hashable {

case tooManyAbsentExpressionChildren(Int)

case globalMatchingOptionNotAtStart(String)

case expectedASCII(Character)

case expectedNonEmptyContents
Expand Down Expand Up @@ -116,6 +118,8 @@ extension ParseError: CustomStringConvertible {
return "\(str) cannot be used as condition"
case let .tooManyAbsentExpressionChildren(i):
return "expected 2 expressions in absent expression, have \(i)"
case let .globalMatchingOptionNotAtStart(opt):
return "matching option '\(opt)' may only appear at the start of the regex"
case let .unknownGroupKind(str):
return "unknown group kind '(\(str)'"
case let .unknownCalloutKind(str):
Expand Down
116 changes: 116 additions & 0 deletions Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1638,6 +1638,12 @@ extension Source {
return .backtrackingDirective(b)
}

// Global matching options can only appear at the very start.
if let opt = try src.lexGlobalMatchingOption() {
throw ParseError.globalMatchingOptionNotAtStart(
String(src[opt.location.range]))
}

// (?C)
if let callout = try src.lexPCRECallout() {
return .callout(callout)
Expand Down Expand Up @@ -1743,5 +1749,115 @@ extension Source {
}
return (dash, end)
}

/// Attempt to lex the name of a newline sequence matching option. Returns
/// `nil` if neither spelling could be consumed.
///
/// NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE'
///
private mutating func lexNewlineSequenceMatchingOption(
) throws -> AST.GlobalMatchingOption.NewlineSequenceMatching? {
  typealias Option = AST.GlobalMatchingOption.NewlineSequenceMatching

  // Spellings are tried in the order listed; the first one consumed wins.
  let spellings: [(String, Option)] = [
    ("BSR_ANYCRLF", .anyCarriageReturnOrLinefeed),
    ("BSR_UNICODE", .anyUnicode),
  ]
  for (spelling, option) in spellings where tryEat(sequence: spelling) {
    return option
  }
  return nil
}

/// Attempt to lex the name of a newline matching option. Returns `nil` if
/// none of the spellings could be consumed.
///
/// NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL'
///
private mutating func lexNewlineMatchingOption(
) throws -> AST.GlobalMatchingOption.NewlineMatching? {
  typealias Option = AST.GlobalMatchingOption.NewlineMatching

  // Candidates are tried top to bottom, so a spelling that is a prefix of
  // another must come after it: CRLF before CR, and ANYCRLF before ANY.
  let candidates: [(String, Option)] = [
    ("CRLF", .carriageAndLinefeedOnly),
    ("CR", .carriageReturnOnly),
    ("ANYCRLF", .anyCarriageReturnOrLinefeed),
    ("ANY", .anyUnicode),
    ("LF", .linefeedOnly),
    ("NUL", .nulCharacter),
  ]
  for (spelling, option) in candidates {
    if tryEat(sequence: spelling) { return option }
  }
  return nil
}

/// Attempt to lex the kind of a global matching option, i.e. the part written
/// between '(*' and ')'. Returns `nil` if no known kind could be consumed.
///
/// GlobalMatchingOptionKind -> LimitOptionKind '=' <Int>
/// | NewlineKind | NewlineSequenceKind
/// | 'NOTEMPTY_ATSTART' | 'NOTEMPTY'
/// | 'NO_AUTO_POSSESS' | 'NO_DOTSTAR_ANCHOR'
/// | 'NO_JIT' | 'NO_START_OPT' | 'UTF' | 'UCP'
///
/// LimitOptionKind -> 'LIMIT_DEPTH' | 'LIMIT_HEAP'
/// | 'LIMIT_MATCH'
///
private mutating func lexGlobalMatchingOptionKind(
) throws -> Located<AST.GlobalMatchingOption.Kind>? {
  typealias OptionKind = AST.GlobalMatchingOption.Kind

  return try recordLoc { src -> OptionKind? in
    // Newline-related kinds have their own lexers; try those first.
    if let sequenceKind = try src.lexNewlineSequenceMatchingOption() {
      return .newlineSequenceMatching(sequenceKind)
    }
    if let newlineKind = try src.lexNewlineMatchingOption() {
      return .newlineMatching(newlineKind)
    }

    // Limit kinds: once the name has been eaten, '=' and a number are
    // mandatory.
    let limits: [(String, (Located<Int>) -> OptionKind)] = [
      ("LIMIT_DEPTH", OptionKind.limitDepth),
      ("LIMIT_HEAP", OptionKind.limitHeap),
      ("LIMIT_MATCH", OptionKind.limitMatch),
    ]
    for (name, makeLimit) in limits where src.tryEat(sequence: name) {
      try src.expect("=")
      return makeLimit(try src.expectNumber())
    }

    // Argument-less kinds, tried in order. NOTEMPTY_ATSTART has to precede
    // NOTEMPTY so that the shorter spelling doesn't win on the longer input.
    let flags: [(String, OptionKind)] = [
      ("NOTEMPTY_ATSTART", .notEmptyAtStart),
      ("NOTEMPTY", .notEmpty),
      ("NO_AUTO_POSSESS", .noAutoPossess),
      ("NO_DOTSTAR_ANCHOR", .noDotStarAnchor),
      ("NO_JIT", .noJIT),
      ("NO_START_OPT", .noStartOpt),
      ("UTF", .utfMode),
      ("UCP", .unicodeProperties),
    ]
    for (name, flag) in flags where src.tryEat(sequence: name) {
      return flag
    }
    return nil
  }
}

/// Attempt to lex a single global matching option. Returns `nil` if the input
/// does not start with '(*' followed by a known option kind.
///
/// GlobalMatchingOption -> '(*' GlobalMatchingOptionKind ')'
///
mutating func lexGlobalMatchingOption(
) throws -> AST.GlobalMatchingOption? {
  // The recorded location is that of the outer closure, which also eats the
  // '(*' and ')' delimiters, so only the value of the inner kind is kept.
  let located = try recordLoc { src -> AST.GlobalMatchingOption.Kind? in
    try src.tryEating { src in
      guard src.tryEat(sequence: "(*") else { return nil }
      guard let innerKind = try src.lexGlobalMatchingOptionKind() else {
        return nil
      }
      try src.expect(")")
      return innerKind.value
    }
  }
  return located.map { AST.GlobalMatchingOption($0.value, $0.location) }
}

/// Attempt to lex as many consecutive global matching options as possible.
/// Returns `nil` if not even one option could be lexed.
///
/// GlobalMatchingOptionSequence -> GlobalMatchingOption+
///
mutating func lexGlobalMatchingOptionSequence(
) throws -> AST.GlobalMatchingOptionSequence? {
  var collected = [AST.GlobalMatchingOption]()
  while true {
    guard let next = try lexGlobalMatchingOption() else { break }
    collected.append(next)
  }
  // The failable initializer yields `nil` for an empty array.
  return AST.GlobalMatchingOptionSequence(collected)
}
}

11 changes: 9 additions & 2 deletions Sources/_MatchingEngine/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,24 @@ extension Parser {
/// Parse a top-level regular expression. Do not use for recursive calls, use
/// `parseNode()` instead.
///
/// Regex -> GlobalMatchingOptionSequence? RegexNode
///
/// - Returns: An `AST` holding the parsed root node and any global matching
///   options lexed ahead of it.
/// - Throws: Any error raised while lexing the global matching options or
///   parsing the root node, or a `Source.LocatedError` wrapping
///   `ParseError.unbalancedEndOfGroup` if a ')' is left over after the root
///   node has been parsed.
mutating func parse() throws -> AST {
  // First parse any global matching options if present. These are lexed
  // before the root node, matching their position in the grammar above.
  let opts = try source.lexGlobalMatchingOptionSequence()

  // Then parse the root AST node.
  let ast = try parseNode()
  guard source.isEmpty else {
    // parseConcatenation() terminates on encountering a ')' to enable
    // recursive parses of a group body. However for a top-level parse, this
    // means we have an unmatched closing paren, so let's diagnose.
    if let loc = source.tryEatWithLoc(")") {
      throw Source.LocatedError(ParseError.unbalancedEndOfGroup, loc)
    }
    // Input remains but it isn't a ')': no other stopping condition is
    // handled here, so trap rather than return a partial parse.
    fatalError("Unhandled termination condition")
  }
  return .init(ast, globalOptions: opts)
}

/// Parse a regular expression node. This should be used instead of `parse()`
Expand Down
21 changes: 20 additions & 1 deletion Sources/_MatchingEngine/Regex/Printing/DumpAST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ extension _ASTPrintable {

extension AST: _ASTPrintable {
  /// The dump text for the whole tree: the global matching options followed
  /// by a single space (only when options are present), then the dump of the
  /// root node.
  public var _dumpBase: String {
    var result = ""
    if let opts = globalOptions {
      result += "\(opts) "
    }
    result += root._dump()
    return result
  }
}

Expand Down Expand Up @@ -341,3 +346,17 @@ extension AST.AbsentFunction {
"absent function \(kind._dumpBase)"
}
}

extension AST.GlobalMatchingOption.Kind: _ASTPrintable {
  /// The dump text of a kind is its `_canonicalBase`.
  public var _dumpBase: String {
    return _canonicalBase
  }
}

extension AST.GlobalMatchingOption: _ASTPrintable {
  /// An option dumps as the dump text of its kind.
  public var _dumpBase: String {
    return kind._dumpBase
  }
}

extension AST.GlobalMatchingOptionSequence: _ASTPrintable {
  /// The dump text of a sequence: the type name with the interpolated
  /// `options` array between angle brackets.
  public var _dumpBase: String {
    let rendered = "GlobalMatchingOptionSequence<\(options)>"
    return rendered
  }
}
Loading