Skip to content

Commit 88bd2df

Browse files
authored
Merge pull request #136 from hamishknight/so-extra
2 parents e719d6d + c389e5a commit 88bd2df

17 files changed

+493
-144
lines changed

Sources/_MatchingEngine/Regex/AST/CustomCharClass.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ extension AST {
4242
/// the contents should be interpreted literally.
4343
case quote(Quote)
4444

45+
/// Trivia such as non-semantic whitespace.
46+
case trivia(Trivia)
47+
4548
/// A binary operator applied to sets of members `abc&&def`
4649
case setOperation([Member], Located<SetOp>, [Member])
4750
}
@@ -81,11 +84,27 @@ extension CustomCC.Member {
8184
case .range(let r): return r
8285
case .atom(let a): return a
8386
case .quote(let q): return q
87+
case .trivia(let t): return t
8488
case .setOperation(let lhs, let op, let rhs): return (lhs, op, rhs)
8589
}
8690
}
8791

8892
func `as`<T>(_ t: T.Type = T.self) -> T? {
8993
_associatedValue as? T
9094
}
95+
96+
public var isTrivia: Bool {
97+
if case .trivia = self { return true }
98+
return false
99+
}
100+
}
101+
102+
extension AST.CustomCharacterClass {
103+
/// Strip trivia from the character class members. This does not recurse into
104+
/// nested custom character classes.
105+
public var strippingTriviaShallow: Self {
106+
var copy = self
107+
copy.members = copy.members.filter { !$0.isTrivia }
108+
return copy
109+
}
91110
}

Sources/_MatchingEngine/Regex/AST/MatchingOptions.swift

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,16 @@ extension AST {
5050
self.location = location
5151
}
5252

53+
/// Whether this is either the regular or the extra-extended syntax option.
54+
public var isAnyExtended: Bool {
55+
switch kind {
56+
case .extended, .extraExtended:
57+
return true
58+
default:
59+
return false
60+
}
61+
}
62+
5363
public var isTextSegmentMode: Bool {
5464
switch kind {
5565
case .textSegmentGraphemeMode, .textSegmentWordMode:
@@ -93,6 +103,10 @@ extension AST {
93103
self.minusLoc = minusLoc
94104
self.removing = removing
95105
}
106+
107+
/// Whether this set of matching options first resets the options before
108+
/// adding onto them.
109+
public var resetsCurrentOptions: Bool { caretLoc != nil }
96110
}
97111
}
98112

@@ -102,7 +116,10 @@ extension AST.MatchingOption: _ASTPrintable {
102116

103117
extension AST.MatchingOptionSequence: _ASTPrintable {
104118
public var _dumpBase: String {
105-
"adding: \(adding), removing: \(removing), hasCaret: \(caretLoc != nil)"
119+
"""
120+
adding: \(adding), removing: \(removing), \
121+
resetsCurrentOptions: \(resetsCurrentOptions)
122+
"""
106123
}
107124
}
108125

Sources/_MatchingEngine/Regex/AST/Quantification.swift

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,23 @@ extension AST {
1717
public let child: AST.Node
1818
public let location: SourceLocation
1919

20+
/// Any trivia intermixed between the operand and the quantifier, as well
21+
/// as between the quantifier characters themselves. This can occur in
22+
/// extended syntax mode where PCRE permits e.g. `x * +`.
23+
public let trivia: [AST.Trivia]
24+
2025
public init(
2126
_ amount: Located<Amount>,
2227
_ kind: Located<Kind>,
2328
_ child: AST.Node,
24-
_ r: SourceLocation
29+
_ r: SourceLocation,
30+
trivia: [AST.Trivia]
2531
) {
2632
self.amount = amount
2733
self.kind = kind
2834
self.child = child
2935
self.location = r
36+
self.trivia = trivia
3037
}
3138

3239
@frozen

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 85 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -332,29 +332,42 @@ extension Source {
332332
/// Quantifier -> ('*' | '+' | '?' | '{' Range '}') QuantKind?
333333
/// QuantKind -> '?' | '+'
334334
///
335-
mutating func lexQuantifier() throws -> (
336-
Located<Quant.Amount>, Located<Quant.Kind>
337-
)? {
335+
mutating func lexQuantifier(
336+
context: ParsingContext
337+
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
338+
var trivia: [AST.Trivia] = []
339+
340+
if let t = try lexNonSemanticWhitespace(context: context) {
341+
trivia.append(t)
342+
}
343+
338344
let amt: Located<Quant.Amount>? = try recordLoc { src in
339345
if src.tryEat("*") { return .zeroOrMore }
340346
if src.tryEat("+") { return .oneOrMore }
341347
if src.tryEat("?") { return .zeroOrOne }
342348

343349
return try src.tryEating { src in
344-
guard src.tryEat("{"), let range = try src.lexRange(), src.tryEat("}")
350+
guard src.tryEat("{"),
351+
let range = try src.lexRange(context: context),
352+
src.tryEat("}")
345353
else { return nil }
346354
return range.value
347355
}
348356
}
349357
guard let amt = amt else { return nil }
350358

359+
// PCRE allows non-semantic whitespace here in extended syntax mode.
360+
if let t = try lexNonSemanticWhitespace(context: context) {
361+
trivia.append(t)
362+
}
363+
351364
let kind: Located<Quant.Kind> = recordLoc { src in
352365
if src.tryEat("?") { return .reluctant }
353366
if src.tryEat("+") { return .possessive }
354367
return .eager
355368
}
356369

357-
return (amt, kind)
370+
return (amt, kind, trivia)
358371
}
359372

360373
/// Try to consume a range, returning `nil` if unsuccessful.
@@ -363,7 +376,7 @@ extension Source {
363376
/// | ExpRange
364377
/// ExpRange -> '..<' <Int> | '...' <Int>
365378
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
366-
mutating func lexRange() throws -> Located<Quant.Amount>? {
379+
mutating func lexRange(context: ParsingContext) throws -> Located<Quant.Amount>? {
367380
try recordLoc { src in
368381
try src.tryEating { src in
369382
let lowerOpt = try src.lexNumber()
@@ -375,7 +388,7 @@ extension Source {
375388
let closedRange: Bool?
376389
if src.tryEat(",") {
377390
closedRange = true
378-
} else if src.experimentalRanges && src.tryEat(".") {
391+
} else if context.experimentalRanges && src.tryEat(".") {
379392
try src.expect(".")
380393
if src.tryEat(".") {
381394
closedRange = true
@@ -477,12 +490,12 @@ extension Source {
477490
///
478491
/// TODO: Need to support some escapes
479492
///
480-
mutating func lexQuote() throws -> AST.Quote? {
493+
mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? {
481494
let str = try recordLoc { src -> String? in
482495
if src.tryEat(sequence: #"\Q"#) {
483496
return try src.expectQuoted(endingWith: #"\E"#).value
484497
}
485-
if src.experimentalQuotes, src.tryEat("\"") {
498+
if context.experimentalQuotes, src.tryEat("\"") {
486499
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
487500
}
488501
return nil
@@ -499,16 +512,27 @@ extension Source {
499512
///
500513
/// ExpComment -> '/*' (!'*/' .)* '*/'
501514
///
515+
/// With `SyntaxOptions.endOfLineComments`
516+
///
517+
/// EndOfLineComment -> '#' .*
518+
///
502519
/// TODO: Swift-style nested comments, line-ending comments, etc
503520
///
504-
mutating func lexComment() throws -> AST.Trivia? {
521+
mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {
505522
let trivia: Located<String>? = try recordLoc { src in
506523
if src.tryEat(sequence: "(?#") {
507524
return try src.expectQuoted(endingWith: ")").value
508525
}
509-
if src.experimentalComments, src.tryEat(sequence: "/*") {
526+
if context.experimentalComments, src.tryEat(sequence: "/*") {
510527
return try src.expectQuoted(endingWith: "*/").value
511528
}
529+
if context.endOfLineComments, src.tryEat("#") {
530+
// TODO: If we ever support multi-line regex literals, this will need
531+
// to be updated to stop at a newline. Note though that PCRE specifies
532+
// that the newline it matches against can be controlled by the global
533+
// matching options e.g. `(*CR)`, `(*ANY)`, ...
534+
return src.lexUntil(\.isEmpty).value
535+
}
512536
return nil
513537
}
514538
guard let trivia = trivia else { return nil }
@@ -517,16 +541,55 @@ extension Source {
517541

518542
/// Try to consume non-semantic whitespace as trivia
519543
///
544+
/// Whitespace -> WhitespaceChar+
545+
///
520546
/// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
521-
mutating func lexNonSemanticWhitespace() throws -> AST.Trivia? {
522-
guard syntax.ignoreWhitespace else { return nil }
547+
mutating func lexNonSemanticWhitespace(
548+
context: ParsingContext
549+
) throws -> AST.Trivia? {
550+
guard context.ignoreWhitespace else { return nil }
551+
552+
func isWhitespace(_ c: Character) -> Bool {
553+
// This is a list of characters that PCRE treats as whitespace when
554+
// compiled with Unicode support. It is a subset of the characters with
555+
// the `.isWhitespace` property. ICU appears to also follow this list.
556+
// Oniguruma and .NET follow a subset of this list.
557+
//
558+
// FIXME: PCRE only treats space and tab characters as whitespace when
559+
// inside a custom character class (and only treats whitespace as
560+
// non-semantic there for the extra-extended `(?xx)` mode). If we get a
561+
// strict-PCRE mode, we'll need to add a case for that.
562+
switch c {
563+
case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r
564+
"\u{85}", "\u{200E}", // next line, left-to-right mark
565
"\u{200F}", "\u{2028}", // right-to-left mark, line separator
566+
"\u{2029}": // paragraph separator
567+
return true
568+
default:
569+
return false
570+
}
571+
}
523572
let trivia: Located<String>? = recordLoc { src in
524-
src.tryEatPrefix { $0 == " " }?.string
573+
src.tryEatPrefix(isWhitespace)?.string
525574
}
526575
guard let trivia = trivia else { return nil }
527576
return AST.Trivia(trivia)
528577
}
529578

579+
/// Try to consume trivia.
580+
///
581+
/// Trivia -> Comment | Whitespace
582+
///
583+
mutating func lexTrivia(context: ParsingContext) throws -> AST.Trivia? {
584+
if let comment = try lexComment(context: context) {
585+
return comment
586+
}
587+
if let whitespace = try lexNonSemanticWhitespace(context: context) {
588+
return whitespace
589+
}
590+
return nil
591+
}
592+
530593
/// Try to lex a matching option.
531594
///
532595
/// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w'
@@ -761,6 +824,7 @@ extension Source {
761824
/// comments, like quotes, cannot be quantified.
762825
///
763826
mutating func lexGroupStart(
827+
context: ParsingContext
764828
) throws -> Located<AST.Group.Kind>? {
765829
try recordLoc { src in
766830
try src.tryEating { src in
@@ -825,7 +889,7 @@ extension Source {
825889
}
826890

827891
// (_:)
828-
if src.experimentalCaptures && src.tryEat(sequence: "_:") {
892+
if context.experimentalCaptures && src.tryEat(sequence: "_:") {
829893
return .nonCapture
830894
}
831895
// TODO: (name:)
@@ -960,9 +1024,12 @@ extension Source {
9601024
///
9611025
/// GroupConditionalStart -> '(?' GroupStart
9621026
///
963-
mutating func lexGroupConditionalStart() throws -> Located<AST.Group.Kind>? {
1027+
mutating func lexGroupConditionalStart(
1028+
context: ParsingContext
1029+
) throws -> Located<AST.Group.Kind>? {
9641030
try tryEating { src in
965-
guard src.tryEat(sequence: "(?"), let group = try src.lexGroupStart()
1031+
guard src.tryEat(sequence: "(?"),
1032+
let group = try src.lexGroupStart(context: context)
9661033
else { return nil }
9671034

9681035
// Implicitly scoped groups are not supported here.
@@ -1607,7 +1674,7 @@ extension Source {
16071674
var name: Located<String>?
16081675
if src.tryEat(":") {
16091676
// TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the
1610-
// name under PCRE2_ALT_VERBNAMES.
1677+
// name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x).
16111678
name = try src.expectQuoted(endingWith: ")", eatEnding: false)
16121679
}
16131680
try src.expect(")")

0 commit comments

Comments
 (0)