Skip to content

Commit cc6ad70

Browse files
committed
Parse extended syntax
If the `(?x)` or `(?xx)` options are active in a given scope, treat whitespace as non-semantic, including in custom character classes. Additionally parse end-of-line comments `# comment`.
1 parent 6e8d176 commit cc6ad70

File tree

12 files changed

+325
-25
lines changed

12 files changed

+325
-25
lines changed

Sources/_MatchingEngine/Regex/AST/CustomCharClass.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ extension AST {
4242
/// the contents should be interpreted literally.
4343
case quote(Quote)
4444

45+
/// Trivia such as non-semantic whitespace.
46+
case trivia(Trivia)
47+
4548
/// A binary operator applied to sets of members `abc&&def`
4649
case setOperation([Member], Located<SetOp>, [Member])
4750
}
@@ -81,6 +84,7 @@ extension CustomCC.Member {
8184
case .range(let r): return r
8285
case .atom(let a): return a
8386
case .quote(let q): return q
87+
case .trivia(let t): return t
8488
case .setOperation(let lhs, let op, let rhs): return (lhs, op, rhs)
8589
}
8690
}

Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,16 @@ extension AST {
4545
self.location = location
4646
}
4747

48+
/// Whether this is either the regular or extra extended syntax option.
49+
public var isAnyExtended: Bool {
50+
switch kind {
51+
case .extended, .extraExtended:
52+
return true
53+
default:
54+
return false
55+
}
56+
}
57+
4858
public var isTextSegmentMode: Bool {
4959
switch kind {
5060
case .textSegmentGraphemeMode, .textSegmentWordMode:
@@ -79,6 +89,10 @@ extension AST {
7989
self.minusLoc = minusLoc
8090
self.removing = removing
8191
}
92+
93+
/// Whether this set of matching options first resets the options before
94+
/// adding onto them.
95+
public var resetsCurrentOptions: Bool { caretLoc != nil }
8296
}
8397
}
8498

@@ -88,7 +102,10 @@ extension AST.MatchingOption: _ASTPrintable {
88102

89103
extension AST.MatchingOptionSequence: _ASTPrintable {
90104
public var _dumpBase: String {
91-
"adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)"
105+
"""
106+
adding: \(adding), removing: \(removing), \
107+
resetsCurrentOptions: \(resetsCurrentOptions)
108+
"""
92109
}
93110
}
94111

Sources/_MatchingEngine/Regex/AST/Quantification.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,23 @@ extension AST {
1717
public let child: AST.Node
1818
public let location: SourceLocation
1919

20+
/// Any trivia intermixed between the operand and the quantifier, as well
21+
/// as between the quantifier characters themselves. This can occur in
22+
/// extended syntax mode where PCRE permits e.g. `x * +`.
23+
public let trivia: [AST.Trivia]
24+
2025
public init(
2126
_ amount: Located<Amount>,
2227
_ kind: Located<Kind>,
2328
_ child: AST.Node,
24-
_ r: SourceLocation
29+
_ r: SourceLocation,
30+
trivia: [AST.Trivia]
2531
) {
2632
self.amount = amount
2733
self.kind = kind
2834
self.child = child
2935
self.location = r
36+
self.trivia = trivia
3037
}
3138

3239
@frozen

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,15 @@ extension Source {
332332
/// Quantifier -> ('*' | '+' | '?' | '{' Range '}') QuantKind?
333333
/// QuantKind -> '?' | '+'
334334
///
335-
mutating func lexQuantifier(context: ParsingContext) throws -> (
336-
Located<Quant.Amount>, Located<Quant.Kind>
337-
)? {
335+
mutating func lexQuantifier(
336+
context: ParsingContext
337+
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
338+
var trivia: [AST.Trivia] = []
339+
340+
if let t = try lexNonSemanticWhitespace(context: context) {
341+
trivia.append(t)
342+
}
343+
338344
let amt: Located<Quant.Amount>? = try recordLoc { src in
339345
if src.tryEat("*") { return .zeroOrMore }
340346
if src.tryEat("+") { return .oneOrMore }
@@ -350,13 +356,18 @@ extension Source {
350356
}
351357
guard let amt = amt else { return nil }
352358

359+
// PCRE allows non-semantic whitespace here in extended syntax mode.
360+
if let t = try lexNonSemanticWhitespace(context: context) {
361+
trivia.append(t)
362+
}
363+
353364
let kind: Located<Quant.Kind> = recordLoc { src in
354365
if src.tryEat("?") { return .reluctant }
355366
if src.tryEat("+") { return .possessive }
356367
return .eager
357368
}
358369

359-
return (amt, kind)
370+
return (amt, kind, trivia)
360371
}
361372

362373
/// Try to consume a range, returning `nil` if unsuccessful.
@@ -501,6 +512,10 @@ extension Source {
501512
///
502513
/// ExpComment -> '/*' (!'*/' .)* '*/'
503514
///
515+
/// With `SyntaxOptions.endOfLineComments`
516+
///
517+
/// EndOfLineComment -> '#' .*
518+
///
504519
/// TODO: Swift-style nested comments, line-ending comments, etc
505520
///
506521
mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {
@@ -511,6 +526,13 @@ extension Source {
511526
if context.experimentalComments, src.tryEat(sequence: "/*") {
512527
return try src.expectQuoted(endingWith: "*/").value
513528
}
529+
if context.endOfLineComments, src.tryEat("#") {
530+
// TODO: If we ever support multi-line regex literals, this will need
531+
// to be updated to stop at a newline. Note though that PCRE specifies
532+
// that the newline it matches against can be controlled by the global
533+
// matching options, e.g. `(*CR)`, `(*ANY)`, ...
534+
return src.lexUntil(\.isEmpty).value
535+
}
514536
return nil
515537
}
516538
guard let trivia = trivia else { return nil }
@@ -519,15 +541,36 @@ extension Source {
519541

520542
/// Try to consume non-semantic whitespace as trivia
521543
///
522-
/// Whitespace -> ' '+
544+
/// Whitespace -> WhitespaceChar+
523545
///
524546
/// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
525547
mutating func lexNonSemanticWhitespace(
526548
context: ParsingContext
527549
) throws -> AST.Trivia? {
528550
guard context.ignoreWhitespace else { return nil }
551+
552+
func isWhitespace(_ c: Character) -> Bool {
553+
// This is a list of characters that PCRE treats as whitespace when
554+
// compiled with Unicode support. It is a subset of the characters with
555+
// the `.isWhitespace` property. ICU appears to also follow this list.
556+
// Oniguruma and .NET follow a subset of this list.
557+
//
558+
// FIXME: PCRE only treats space and tab characters as whitespace when
559+
// inside a custom character class (and only treats whitespace as
560+
// non-semantic there for the extra-extended `(?xx)` mode). If we get a
561+
// strict-PCRE mode, we'll need to add a case for that.
562+
switch c {
563+
case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r
564+
"\u{85}", "\u{200E}", // next line, left-to-right mark
565+
"\u{200F}", "\u{2028}", // right-to-left-mark, line separator
566+
"\u{2029}": // paragraph separator
567+
return true
568+
default:
569+
return false
570+
}
571+
}
529572
let trivia: Located<String>? = recordLoc { src in
530-
src.tryEatPrefix { $0 == " " }?.string
573+
src.tryEatPrefix(isWhitespace)?.string
531574
}
532575
guard let trivia = trivia else { return nil }
533576
return AST.Trivia(trivia)
@@ -1622,7 +1665,7 @@ extension Source {
16221665
var name: Located<String>?
16231666
if src.tryEat(":") {
16241667
// TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the
1625-
// name under PCRE2_ALT_VERBNAMES.
1668+
// name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x).
16261669
name = try src.expectQuoted(endingWith: ")", eatEnding: false)
16271670
}
16281671
try src.expect(")")

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ extension ParsingContext {
106106
var experimentalQuotes: Bool { syntax.contains(.experimentalQuotes) }
107107
var experimentalComments: Bool { syntax.contains(.experimentalComments) }
108108
var ignoreWhitespace: Bool { syntax.contains(.nonSemanticWhitespace) }
109+
var endOfLineComments: Bool { syntax.contains(.endOfLineComments) }
109110
}
110111

111112
// Diagnostics
@@ -208,12 +209,14 @@ extension Parser {
208209
}
209210
// Quantification -> QuantOperand Quantifier?
210211
if let operand = try parseQuantifierOperand() {
211-
if let (amt, kind) = try source.lexQuantifier(context: context) {
212+
if let (amt, kind, trivia) =
213+
try source.lexQuantifier(context: context) {
212214
let location = loc(_start)
213215
guard operand.isQuantifiable else {
214216
throw Source.LocatedError(ParseError.notQuantifiable, location)
215217
}
216-
result.append(.quantification(.init(amt, kind, operand, location)))
218+
result.append(.quantification(
219+
.init(amt, kind, operand, location, trivia: trivia)))
217220
} else {
218221
result.append(operand)
219222
}
@@ -270,6 +273,28 @@ extension Parser {
270273
) throws -> AST.Group {
271274
context.recordGroup(kind.value)
272275

276+
// Check if we're introducing or removing extended syntax.
277+
// TODO: PCRE differentiates between (?x) and (?xx) where only the latter
278+
// handles non-semantic whitespace in a custom character class. Other
279+
// engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
280+
// treat (?x) and (?xx) as the same option here. If we ever get a strict
281+
// PCRE mode, we will need to change this to handle that.
282+
let currentSyntax = context.syntax
283+
if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
284+
if c.resetsCurrentOptions {
285+
context.syntax.remove(.extendedSyntax)
286+
}
287+
if c.adding.contains(where: \.isAnyExtended) {
288+
context.syntax.insert(.extendedSyntax)
289+
}
290+
if c.removing.contains(where: \.isAnyExtended) {
291+
context.syntax.remove(.extendedSyntax)
292+
}
293+
}
294+
defer {
295+
context.syntax = currentSyntax
296+
}
297+
273298
let child = try parseNode()
274299
// An implicit scoped group has already consumed its closing paren.
275300
if !kind.value.hasImplicitScope {
@@ -449,6 +474,13 @@ extension Parser {
449474
continue
450475
}
451476

477+
// Lex non-semantic whitespace if we're allowed.
478+
// TODO: ICU allows end-of-line comments in custom character classes,
479+
// which we ought to support if we want to support multi-line regex.
480+
if let trivia = try source.lexNonSemanticWhitespace(context: context) {
481+
members.append(.trivia(trivia))
482+
}
483+
452484
guard let atom = try source.lexAtom(context: context) else { break }
453485

454486
// Range between atoms.

Sources/_MatchingEngine/Regex/Parse/SyntaxOptions.swift

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,27 @@ public struct SyntaxOptions: OptionSet {
2323
/// `'a \. b' == '/a\.b/'`
2424
public static var nonSemanticWhitespace: Self { Self(1 << 0) }
2525

26+
/// `abc # comment`
27+
public static var endOfLineComments: Self { Self(1 << 1) }
28+
29+
/// `(?x)` `(?xx)`
30+
public static var extendedSyntax: Self {
31+
[.endOfLineComments, .nonSemanticWhitespace]
32+
}
33+
2634
/// `'a "." b' == '/a\Q.\Eb/'`
2735
///
2836
/// NOTE: Currently, this means we have raw quotes.
2937
/// Better would be to have real Swift string delimiter parsing logic.
30-
public static var experimentalQuotes: Self { Self(1 << 1) }
38+
public static var experimentalQuotes: Self { Self(1 << 2) }
3139

3240
/// `'a /* comment */ b' == '/a(?#. comment )b/'`
3341
///
3442
/// NOTE: traditional comments are not nested. Currently, we are neither.
3543
/// Traditional comments can't have `)`, not even escaped in them either, we
3644
/// can. Traditional comments can have `*/` in them, we can't without
3745
/// escaping. We don't currently do escaping.
38-
public static var experimentalComments: Self { Self(1 << 2) }
46+
public static var experimentalComments: Self { Self(1 << 3) }
3947

4048
/// ```
4149
/// 'a{n...m}' == '/a{n,m}/'
@@ -44,11 +52,11 @@ public struct SyntaxOptions: OptionSet {
4452
/// 'a{...m}' == '/a{,m}/'
4553
/// 'a{..<m}' == '/a{,m-1}/'
4654
/// ```
47-
public static var experimentalRanges: Self { Self(1 << 3) }
55+
public static var experimentalRanges: Self { Self(1 << 4) }
4856

4957
/// `(name: .*)` == `(?<name>.*)`
5058
/// `(_: .*)` == `(?:.*)`
51-
public static var experimentalCaptures: Self { Self(1 << 4) }
59+
public static var experimentalCaptures: Self { Self(1 << 5) }
5260

5361
/*
5462

@@ -59,10 +67,9 @@ public struct SyntaxOptions: OptionSet {
5967

6068
public static var traditional: Self { Self(0) }
6169

62-
public static var experimental: Self { Self(~0) }
63-
64-
public var ignoreWhitespace: Bool {
65-
contains(.nonSemanticWhitespace)
70+
public static var experimental: Self {
71+
// Experimental syntax enables everything except end-of-line comments.
72+
Self(~0).subtracting(.endOfLineComments)
6673
}
6774

6875
// TODO: Probably want to model strict-PCRE etc. options too.

Sources/_MatchingEngine/Regex/Printing/DumpAST.swift

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ extension _ASTPrintable {
5252
if $0.isTrivia { return nil }
5353
return $0._dump()
5454
}.joined(separator: ",")
55+
if sub.isEmpty {
56+
return "\(_dumpBase)"
57+
}
5558
return "\(_dumpBase)(\(sub))"
5659
}
5760
}
@@ -287,7 +290,15 @@ extension AST.Quantification: _ASTPrintable {
287290

288291
extension AST.CustomCharacterClass: _ASTNode {
289292
public var _dumpBase: String {
290-
"customCharacterClass(\(members))"
293+
// Exclude trivia for now, as we don't want it to appear when performing
294+
// comparisons of dumped output in tests.
295+
// TODO: We should eventually have some way of filtering out trivia for
296+
// tests, so that it can appear in regular dumps.
297+
let semanticMembers = members.filter {
298+
if case .trivia = $0 { return false }
299+
return true
300+
}
301+
return "customCharacterClass(\(semanticMembers))"
291302
}
292303
}
293304

@@ -298,6 +309,7 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable {
298309
case .atom(let a): return "\(a)"
299310
case .range(let r): return "\(r)"
300311
case .quote(let q): return "\(q)"
312+
case .trivia(let t): return "\(t)"
301313
case .setOperation(let lhs, let op, let rhs):
302314
return "op \(lhs) \(op.value) \(rhs)"
303315
}

Sources/_MatchingEngine/Regex/Printing/PrintAsCanonical.swift

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,7 @@ extension PrettyPrinter {
9595
output(q._canonicalBase)
9696

9797
case let .trivia(t):
98-
// TODO: We might want to output comments...
99-
_ = t
100-
output("")
98+
output(t._canonicalBase)
10199

102100
case let .atom(a):
103101
output(a._canonicalBase)
@@ -139,6 +137,8 @@ extension PrettyPrinter {
139137
output(a._canonicalBase)
140138
case .quote(let q):
141139
output(q._canonicalBase)
140+
case .trivia(let t):
141+
output(t._canonicalBase)
142142
case .setOperation:
143143
output("/* TODO: set operation \(self) */")
144144
}
@@ -319,3 +319,10 @@ extension AST.GlobalMatchingOption.Kind {
319319
extension AST.GlobalMatchingOption {
320320
var _canonicalBase: String { "(*\(kind._canonicalBase))"}
321321
}
322+
323+
extension AST.Trivia {
324+
var _canonicalBase: String {
325+
// TODO: We might want to output comments...
326+
""
327+
}
328+
}

Sources/_MatchingEngine/Regex/Printing/PrintAsPattern.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,9 @@ extension PrettyPrinter {
180180
}
181181
case .quote(let q):
182182
print("// TODO: quote \(q.literal._quoted) in custom character classes (should we split it?)")
183+
case .trivia(let t):
184+
// TODO: We might want to output comments...
185+
_ = t
183186
case .setOperation:
184187
print("// TODO: Set operation: \(member)")
185188
}

0 commit comments

Comments
 (0)