Skip to content

Commit cf2d910

Browse files
authored
Merge pull request #133 from hamishknight/round-again
2 parents a6132a5 + 8177a3e commit cf2d910

File tree

12 files changed

+389
-30
lines changed

12 files changed

+389
-30
lines changed

Sources/_MatchingEngine/Regex/AST/AST.swift

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@
1313
/// node.
1414
public struct AST: Hashable {
1515
public var root: AST.Node
16-
public init(_ root: AST.Node) {
/// The global matching options attached to this AST, or `nil` when there
/// are none.
16+
public var globalOptions: GlobalMatchingOptionSequence?
17+
/// Creates an AST from a root node and an optional sequence of global
/// matching options; both are stored unchanged.
18+
public init(_ root: AST.Node, globalOptions: GlobalMatchingOptionSequence?) {
1719
self.root = root
20+
self.globalOptions = globalOptions
1821
}
1922
}
2023

@@ -291,6 +294,20 @@ extension AST {
291294
/// a group.
292295
public var recursesWholePattern: Bool { kind == .recurseWholePattern }
293296
}
297+
298+
/// A set of global matching options in a regular expression literal.
///
/// A sequence created through `init?(_:)` always holds at least one option,
/// because that initializer rejects an empty array.
299+
public struct GlobalMatchingOptionSequence: Hashable {
/// The options making up the sequence.
300+
public var options: [AST.GlobalMatchingOption]
301+
/// Creates a sequence from `options`, returning `nil` if `options` is empty.
302+
public init?(_ options: [AST.GlobalMatchingOption]) {
303+
guard !options.isEmpty else { return nil }
304+
self.options = options
305+
}
306+
/// The union of the first and last option's source locations.
///
/// NOTE(review): this force-unwraps `first` and `last`. The initializer
/// guarantees a non-empty array at construction, but `options` is a public
/// mutable property, so emptying it afterwards would make this trap.
307+
public var location: SourceLocation {
308+
options.first!.location.union(with: options.last!.location)
309+
}
310+
}
294311
}
295312

296313
// FIXME: Get this out of here

Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,82 @@ extension AST.MatchingOptionSequence: _ASTPrintable {
105105
"adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)"
106106
}
107107
}
108+
109+
extension AST {
110+
/// Global matching option specifiers. Unlike `MatchingOptionSequence`,
111+
/// these must appear at the start of the pattern, and apply globally.
112+
public struct GlobalMatchingOption: _ASTNode, Hashable {
113+
/// Determines the definition of a newline for the '.' character class.
114+
public enum NewlineMatching: Hashable {
115+
/// (*CR)
116+
case carriageReturnOnly
117+
118+
/// (*LF)
119+
case linefeedOnly
120+
121+
/// (*CRLF)
122+
case carriageAndLinefeedOnly
123+
124+
/// (*ANYCRLF)
125+
case anyCarriageReturnOrLinefeed
126+
127+
/// (*ANY)
128+
case anyUnicode
129+
130+
/// (*NUL)
131+
case nulCharacter
132+
}
133+
/// Determines what `\R` matches.
134+
public enum NewlineSequenceMatching: Hashable {
135+
/// (*BSR_ANYCRLF)
136+
case anyCarriageReturnOrLinefeed
137+
138+
/// (*BSR_UNICODE)
139+
case anyUnicode
140+
}
/// The specific global option that was written. Each case is annotated
/// with its spelling in the pattern; `d` stands for the numeric argument
/// carried by the `limit*` cases.
141+
public enum Kind: Hashable {
142+
/// (*LIMIT_DEPTH=d)
143+
case limitDepth(Located<Int>)
144+
145+
/// (*LIMIT_HEAP=d)
146+
case limitHeap(Located<Int>)
147+
148+
/// (*LIMIT_MATCH=d)
149+
case limitMatch(Located<Int>)
150+
151+
/// (*NOTEMPTY)
152+
case notEmpty
153+
154+
/// (*NOTEMPTY_ATSTART)
155+
case notEmptyAtStart
156+
157+
/// (*NO_AUTO_POSSESS)
158+
case noAutoPossess
159+
160+
/// (*NO_DOTSTAR_ANCHOR)
161+
case noDotStarAnchor
162+
163+
/// (*NO_JIT)
164+
case noJIT
165+
166+
/// (*NO_START_OPT)
167+
case noStartOpt
168+
169+
/// (*UTF)
170+
case utfMode
171+
172+
/// (*UCP)
173+
case unicodeProperties
174+
/// A newline-definition option; see `NewlineMatching` for the spellings.
175+
case newlineMatching(NewlineMatching)
/// A `\R` option; see `NewlineSequenceMatching` for the spellings.
176+
case newlineSequenceMatching(NewlineSequenceMatching)
177+
}
/// Which option this node represents.
178+
public var kind: Kind
/// The source location recorded for this option.
179+
public var location: SourceLocation
180+
/// Creates an option node from its kind and source location.
181+
public init(_ kind: Kind, _ location: SourceLocation) {
182+
self.kind = kind
183+
self.location = location
184+
}
185+
}
186+
}

Sources/_MatchingEngine/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ enum ParseError: Error, Hashable {
3333

3434
case tooManyAbsentExpressionChildren(Int)
3535

36+
case globalMatchingOptionNotAtStart(String)
37+
3638
case expectedASCII(Character)
3739

3840
case expectedNonEmptyContents
@@ -116,6 +118,8 @@ extension ParseError: CustomStringConvertible {
116118
return "\(str) cannot be used as condition"
117119
case let .tooManyAbsentExpressionChildren(i):
118120
return "expected 2 expressions in absent expression, have \(i)"
121+
case let .globalMatchingOptionNotAtStart(opt):
122+
return "matching option '\(opt)' may only appear at the start of the regex"
119123
case let .unknownGroupKind(str):
120124
return "unknown group kind '(\(str)'"
121125
case let .unknownCalloutKind(str):

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,6 +1638,12 @@ extension Source {
16381638
return .backtrackingDirective(b)
16391639
}
16401640

1641+
// Global matching options can only appear at the very start.
1642+
if let opt = try src.lexGlobalMatchingOption() {
1643+
throw ParseError.globalMatchingOptionNotAtStart(
1644+
String(src[opt.location.range]))
1645+
}
1646+
16411647
// (?C)
16421648
if let callout = try src.lexPCRECallout() {
16431649
return .callout(callout)
@@ -1743,5 +1749,115 @@ extension Source {
17431749
}
17441750
return (dash, end)
17451751
}
1752+
1753+
/// Try to consume a newline sequence matching option kind.
1754+
///
1755+
/// NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE'
1756+
///
/// Returns the matching case, or `nil` when `tryEat(sequence:)` accepts
/// neither keyword.
///
/// NOTE(review): nothing in this body uses `try`; the `throws` in the
/// signature looks like it is kept for parity with the sibling lexing
/// helpers — confirm before relying on it.
1757+
private mutating func lexNewlineSequenceMatchingOption(
1758+
) throws -> AST.GlobalMatchingOption.NewlineSequenceMatching? {
1759+
if tryEat(sequence: "BSR_ANYCRLF") { return .anyCarriageReturnOrLinefeed }
1760+
if tryEat(sequence: "BSR_UNICODE") { return .anyUnicode }
1761+
return nil
1762+
}
1763+
1764+
/// Try to consume a newline matching option kind.
1765+
///
1766+
/// NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL'
1767+
///
/// Returns the matching case, or `nil` when none of the keywords is
/// accepted by `tryEat(sequence:)`.
1768+
private mutating func lexNewlineMatchingOption(
1769+
) throws -> AST.GlobalMatchingOption.NewlineMatching? {
1770+
// The ordering here is important: CRLF needs to precede CR, and ANYCRLF
1771+
// needs to precede ANY to ensure we don't short circuit on the wrong one.
1772+
if tryEat(sequence: "CRLF") { return .carriageAndLinefeedOnly }
1773+
if tryEat(sequence: "CR") { return .carriageReturnOnly }
1774+
if tryEat(sequence: "ANYCRLF") { return .anyCarriageReturnOrLinefeed }
1775+
if tryEat(sequence: "ANY") { return .anyUnicode }
1776+
// LF and NUL are not a prefix of any other keyword tried in this function,
// so their position in the chain does not matter.
1777+
if tryEat(sequence: "LF") { return .linefeedOnly }
1778+
if tryEat(sequence: "NUL") { return .nulCharacter }
1779+
return nil
1780+
}
1781+
1782+
/// Try to consume a global matching option kind, returning `nil` if
1783+
/// unsuccessful.
1784+
///
1785+
/// GlobalMatchingOptionKind -> LimitOptionKind '=' <Int>
1786+
/// | NewlineKind | NewlineSequenceKind
1787+
/// | 'NOTEMPTY_ATSTART' | 'NOTEMPTY'
1788+
/// | 'NO_AUTO_POSSESS' | 'NO_DOTSTAR_ANCHOR'
1789+
/// | 'NO_JIT' | 'NO_START_OPT' | 'UTF' | 'UCP'
1790+
///
1791+
/// LimitOptionKind -> 'LIMIT_DEPTH' | 'LIMIT_HEAP'
1792+
/// | 'LIMIT_MATCH'
1793+
///
/// Only the keyword (and, for limits, its `=<Int>` argument) is handled
/// here; the surrounding `(*` and `)` are not consumed by this function.
1794+
private mutating func lexGlobalMatchingOptionKind(
1795+
) throws -> Located<AST.GlobalMatchingOption.Kind>? {
1796+
try recordLoc { src in
1797+
if let opt = try src.lexNewlineSequenceMatchingOption() {
1798+
return .newlineSequenceMatching(opt)
1799+
}
1800+
if let opt = try src.lexNewlineMatchingOption() {
1801+
return .newlineMatching(opt)
1802+
}
// Once a LIMIT_* keyword has matched, the '=' and the number are required:
// `expect` / `expectNumber` are called with `try`, so a failure there
// propagates as an error instead of falling through to `nil`.
1803+
if src.tryEat(sequence: "LIMIT_DEPTH") {
1804+
try src.expect("=")
1805+
return .limitDepth(try src.expectNumber())
1806+
}
1807+
if src.tryEat(sequence: "LIMIT_HEAP") {
1808+
try src.expect("=")
1809+
return .limitHeap(try src.expectNumber())
1810+
}
1811+
if src.tryEat(sequence: "LIMIT_MATCH") {
1812+
try src.expect("=")
1813+
return .limitMatch(try src.expectNumber())
1814+
}
1815+
1816+
// The ordering here is important: NOTEMPTY_ATSTART needs to precede
1817+
// NOTEMPTY to ensure we don't short circuit on the wrong one.
1818+
if src.tryEat(sequence: "NOTEMPTY_ATSTART") { return .notEmptyAtStart }
1819+
if src.tryEat(sequence: "NOTEMPTY") { return .notEmpty }
1820+
1821+
if src.tryEat(sequence: "NO_AUTO_POSSESS") { return .noAutoPossess }
1822+
if src.tryEat(sequence: "NO_DOTSTAR_ANCHOR") { return .noDotStarAnchor }
1823+
if src.tryEat(sequence: "NO_JIT") { return .noJIT }
1824+
if src.tryEat(sequence: "NO_START_OPT") { return .noStartOpt }
1825+
if src.tryEat(sequence: "UTF") { return .utfMode }
1826+
if src.tryEat(sequence: "UCP") { return .unicodeProperties }
1827+
return nil
1828+
}
1829+
}
1830+
1831+
/// Try to consume a global matching option, returning `nil` if unsuccessful.
1832+
///
1833+
/// GlobalMatchingOption -> '(*' GlobalMatchingOptionKind ')'
1834+
///
/// `nil` is returned when the input does not start with `(*` or when no
/// option keyword follows it. After a keyword has been recognized, the
/// closing `)` is required via `try src.expect(")")`, so its absence
/// surfaces as a thrown error rather than `nil`.
1835+
mutating func lexGlobalMatchingOption(
1836+
) throws -> AST.GlobalMatchingOption? {
// The location kept on the result comes from this outer `recordLoc`; the
// one attached by `lexGlobalMatchingOptionKind()` is discarded via `.value`.
1837+
let kind = try recordLoc { src -> AST.GlobalMatchingOption.Kind? in
// NOTE(review): `tryEating` presumably restores the source position when
// the closure returns nil, un-eating the "(*" — confirm at its definition.
1838+
try src.tryEating { src in
1839+
guard src.tryEat(sequence: "(*"),
1840+
let kind = try src.lexGlobalMatchingOptionKind()?.value
1841+
else { return nil }
1842+
try src.expect(")")
1843+
return kind
1844+
}
1845+
}
1846+
guard let kind = kind else { return nil }
1847+
return .init(kind.value, kind.location)
1848+
}
1849+
1850+
/// Try to consume a sequence of global matching options.
1851+
///
1852+
/// GlobalMatchingOptionSequence -> GlobalMatchingOption+
1853+
///
/// Returns `nil` when no option could be lexed.
1854+
mutating func lexGlobalMatchingOptionSequence(
1855+
) throws -> AST.GlobalMatchingOptionSequence? {
1856+
var opts: [AST.GlobalMatchingOption] = []
// Keep lexing options back-to-back until the next one fails to match.
1857+
while let opt = try lexGlobalMatchingOption() {
1858+
opts.append(opt)
1859+
}
// `GlobalMatchingOptionSequence.init?` fails on an empty array, which is
// what turns "zero options" into a `nil` result here.
1860+
return .init(opts)
1861+
}
17461862
}
17471863

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,17 +117,24 @@ extension Parser {
117117
/// Parse a top-level regular expression. Do not use for recursive calls, use
118118
/// `parseNode()` instead.
119119
///
120-
/// Regex -> RegexNode
120+
/// Regex -> GlobalMatchingOptionSequence? RegexNode
121121
///
/// Throws `ParseError.unbalancedEndOfGroup` (wrapped in a
/// `Source.LocatedError`) when a `)` is left over after the root node.
122122
mutating func parse() throws -> AST {
123+
// First parse any global matching options if present.
124+
let opts = try source.lexGlobalMatchingOptionSequence()
125+
126+
// Then parse the root AST node.
123127
let ast = try parseNode()
124128
guard source.isEmpty else {
129+
// parseConcatenation() terminates on encountering a ')' to enable
130+
// recursive parses of a group body. However for a top-level parse, this
131+
// means we have an unmatched closing paren, so let's diagnose.
125132
if let loc = source.tryEatWithLoc(")") {
126133
throw Source.LocatedError(ParseError.unbalancedEndOfGroup, loc)
127134
}
// Leftover input that is not a ')' is not handled here and traps.
128135
fatalError("Unhandled termination condition")
129136
}
130-
return .init(ast)
137+
return .init(ast, globalOptions: opts)
131138
}
132139

133140
/// Parse a regular expression node. This should be used instead of `parse()`

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,12 @@ extension _ASTPrintable {
5858

5959
// Dump support for the top-level AST.
extension AST: _ASTPrintable {
6060
public var _dumpBase: String {
61-
root._dumpBase
61+
var result = ""
// Global options, when present, are printed first and separated from the
// root node's dump by a single space.
62+
if let opts = globalOptions {
63+
result += "\(opts) "
64+
}
65+
result += root._dump()
66+
return result
6267
}
6368
}
6469

@@ -341,3 +346,17 @@ extension AST.AbsentFunction {
341346
"absent function \(kind._dumpBase)"
342347
}
343348
}
349+
350+
// Dump support for an option kind; forwards to `_canonicalBase`.
// NOTE(review): `_canonicalBase` is defined outside this view — confirm what
// it renders for this type.
extension AST.GlobalMatchingOption.Kind: _ASTPrintable {
351+
public var _dumpBase: String { _canonicalBase }
352+
}
353+
354+
// Dump support for a single option: prints only its kind, not its location.
extension AST.GlobalMatchingOption: _ASTPrintable {
355+
public var _dumpBase: String { "\(kind._dumpBase)" }
356+
}
357+
358+
// Dump support for an option sequence: wraps the interpolated `options`
// array in a `GlobalMatchingOptionSequence<...>` label.
extension AST.GlobalMatchingOptionSequence: _ASTPrintable {
359+
public var _dumpBase: String {
360+
"GlobalMatchingOptionSequence<\(options)>"
361+
}
362+
}

0 commit comments

Comments
 (0)