@@ -332,29 +332,42 @@ extension Source {
332
332
/// Try to consume a quantifier, along with any non-semantic whitespace lexed
/// around its amount.
///
///     Quantifier -> ('*' | '+' | '?' | '{' Range '}') QuantKind?
///     QuantKind  -> '?' | '+'
///
/// Whitespace lexing is attempted twice: once ahead of the amount and once
/// between the amount and the kind. Each piece that is produced is appended to
/// the returned trivia array in source order.
///
/// - Parameter context: Forwarded to whitespace lexing and to `lexRange`.
/// - Returns: The located amount, the located kind (`.eager` when neither `?`
///   nor `+` follows the amount), and the collected trivia; `nil` when no
///   amount could be lexed.
///
/// NOTE(review): whitespace lexed ahead of a missing amount does not appear to
/// be rewound or handed back to the caller on the `nil` path — confirm callers
/// do not need that trivia.
mutating func lexQuantifier(
  context: ParsingContext
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
  var collected: [AST.Trivia] = []

  // Whitespace ahead of the quantifier amount.
  if let leading = try lexNonSemanticWhitespace(context: context) {
    collected.append(leading)
  }

  let lexedAmount: Located<Quant.Amount>? = try recordLoc { src in
    // Single-character amounts are tried first, in this order.
    if src.tryEat("*") { return .zeroOrMore }
    if src.tryEat("+") { return .oneOrMore }
    if src.tryEat("?") { return .zeroOrOne }

    // Otherwise attempt a braced range; `tryEating` wraps the attempt so that
    // each failing step simply yields `nil`.
    return try src.tryEating { src in
      guard src.tryEat("{") else { return nil }
      guard let range = try src.lexRange(context: context) else { return nil }
      guard src.tryEat("}") else { return nil }
      return range.value
    }
  }
  guard let amount = lexedAmount else { return nil }

  // PCRE allows non-semantic whitespace here in extended syntax mode.
  if let inner = try lexNonSemanticWhitespace(context: context) {
    collected.append(inner)
  }

  let kind: Located<Quant.Kind> = recordLoc { src in
    if src.tryEat("?") {
      return .reluctant
    } else if src.tryEat("+") {
      return .possessive
    } else {
      return .eager
    }
  }

  return (amount, kind, collected)
}
359
372
360
373
/// Try to consume a range, returning `nil` if unsuccessful.
@@ -363,7 +376,7 @@ extension Source {
363
376
/// | ExpRange
364
377
/// ExpRange -> '..<' <Int> | '...' <Int>
365
378
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
366
- mutating func lexRange( ) throws -> Located < Quant . Amount > ? {
379
+ mutating func lexRange( context : ParsingContext ) throws -> Located < Quant . Amount > ? {
367
380
try recordLoc { src in
368
381
try src. tryEating { src in
369
382
let lowerOpt = try src. lexNumber ( )
@@ -375,7 +388,7 @@ extension Source {
375
388
let closedRange : Bool ?
376
389
if src. tryEat ( " , " ) {
377
390
closedRange = true
378
- } else if src . experimentalRanges && src. tryEat ( " . " ) {
391
+ } else if context . experimentalRanges && src. tryEat ( " . " ) {
379
392
try src. expect ( " . " )
380
393
if src. tryEat ( " . " ) {
381
394
closedRange = true
@@ -477,12 +490,12 @@ extension Source {
477
490
///
478
491
/// TODO: Need to support some escapes
479
492
///
480
- mutating func lexQuote( ) throws -> AST . Quote ? {
493
+ mutating func lexQuote( context : ParsingContext ) throws -> AST . Quote ? {
481
494
let str = try recordLoc { src -> String ? in
482
495
if src. tryEat ( sequence: #"\Q"# ) {
483
496
return try src. expectQuoted ( endingWith: #"\E"# ) . value
484
497
}
485
- if src . experimentalQuotes, src. tryEat ( " \" " ) {
498
+ if context . experimentalQuotes, src. tryEat ( " \" " ) {
486
499
return try src. expectQuoted ( endingWith: " \" " , ignoreEscaped: true ) . value
487
500
}
488
501
return nil
@@ -499,16 +512,27 @@ extension Source {
499
512
///
500
513
/// ExpComment -> '/*' (!'*/' .)* '*/'
501
514
///
515
+ /// With `SyntaxOptions.endOfLineComments`
516
+ ///
517
+ /// EndOfLineComment -> '#' .*
518
+ ///
502
519
/// TODO: Swift-style nested comments, line-ending comments, etc
503
520
///
504
- mutating func lexComment( ) throws -> AST . Trivia ? {
521
+ mutating func lexComment( context : ParsingContext ) throws -> AST . Trivia ? {
505
522
let trivia : Located < String > ? = try recordLoc { src in
506
523
if src. tryEat ( sequence: " (?# " ) {
507
524
return try src. expectQuoted ( endingWith: " ) " ) . value
508
525
}
509
- if src . experimentalComments, src. tryEat ( sequence: " /*") {
526
+ if context . experimentalComments, src. tryEat ( sequence: " /*") {
510
527
return try src.expectQuoted(endingWith: "*/" ) . value
511
528
}
529
+ if context. endOfLineComments, src. tryEat ( " # " ) {
530
+ // TODO: If we ever support multi-line regex literals, this will need
531
+ // to be updated to stop at a newline. Note though that PCRE specifies
532
+ // that the newline it matches against can be controlled by the global
533
+ // matching options e.g `(*CR)`, `(*ANY)`, ...
534
+ return src. lexUntil ( \. isEmpty) . value
535
+ }
512
536
return nil
513
537
}
514
538
guard let trivia = trivia else { return nil }
@@ -517,16 +541,55 @@ extension Source {
517
541
518
542
/// Try to consume non-semantic whitespace as trivia.
///
///     Whitespace -> WhitespaceChar+
///
/// Returns `nil` without touching the input unless `context.ignoreWhitespace`
/// is set.
///
/// - Parameter context: Supplies the `ignoreWhitespace` flag that enables
///   whitespace skipping.
/// - Returns: A trivia node wrapping the located whitespace text, or `nil` if
///   whitespace skipping is disabled or no trivia was produced.
mutating func lexNonSemanticWhitespace(
  context: ParsingContext
) throws -> AST.Trivia? {
  guard context.ignoreWhitespace else { return nil }

  func isWhitespace(_ c: Character) -> Bool {
    // This is a list of characters that PCRE treats as whitespace when
    // compiled with Unicode support. It is a subset of the characters with
    // the `.isWhitespace` property. ICU appears to also follow this list.
    // Oniguruma and .NET follow a subset of this list.
    //
    // FIXME: PCRE only treats space and tab characters as whitespace when
    // inside a custom character class (and only treats whitespace as
    // non-semantic there for the extra-extended `(?xx)` mode). If we get a
    // strict-PCRE mode, we'll need to add a case for that.
    switch c {
    case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r
         // A CR-LF pair is a single `Character` (one grapheme cluster). It is
         // equal to none of the other cases and sorts after "\u{D}", so it
         // must be listed explicitly or CR-LF line endings are not skipped
         // even though CR and LF individually are.
         "\r\n",
         "\u{85}", "\u{200E}",   // next line, left-to-right mark
         "\u{200F}", "\u{2028}", // right-to-left-mark, line separator
         "\u{2029}":             // paragraph separator
      return true
    default:
      return false
    }
  }

  let trivia: Located<String>? = recordLoc { src in
    src.tryEatPrefix(isWhitespace)?.string
  }
  guard let trivia = trivia else { return nil }
  return AST.Trivia(trivia)
}
529
578
579
/// Try to consume a single piece of trivia.
///
///     Trivia -> Comment | Whitespace
///
/// A comment is attempted first; non-semantic whitespace is attempted only
/// when no comment was lexed.
///
/// - Parameter context: Forwarded unchanged to `lexComment` and
///   `lexNonSemanticWhitespace`.
/// - Returns: The lexed comment or whitespace trivia, or `nil` if neither
///   lexer produced anything.
mutating func lexTrivia(context: ParsingContext) throws -> AST.Trivia? {
  // `??` evaluates its right-hand side lazily, so whitespace lexing runs only
  // when the comment lexer returned `nil`.
  try lexComment(context: context)
    ?? lexNonSemanticWhitespace(context: context)
}
592
+
530
593
/// Try to lex a matching option.
531
594
///
532
595
/// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w'
@@ -761,6 +824,7 @@ extension Source {
761
824
/// comments, like quotes, cannot be quantified.
762
825
///
763
826
mutating func lexGroupStart(
827
+ context: ParsingContext
764
828
) throws -> Located < AST . Group . Kind > ? {
765
829
try recordLoc { src in
766
830
try src. tryEating { src in
@@ -825,7 +889,7 @@ extension Source {
825
889
}
826
890
827
891
// (_:)
828
- if src . experimentalCaptures && src. tryEat ( sequence: " _: " ) {
892
+ if context . experimentalCaptures && src. tryEat ( sequence: " _: " ) {
829
893
return . nonCapture
830
894
}
831
895
// TODO: (name:)
@@ -960,9 +1024,12 @@ extension Source {
960
1024
///
961
1025
/// GroupConditionalStart -> '(?' GroupStart
962
1026
///
963
- mutating func lexGroupConditionalStart( ) throws -> Located < AST . Group . Kind > ? {
1027
+ mutating func lexGroupConditionalStart(
1028
+ context: ParsingContext
1029
+ ) throws -> Located < AST . Group . Kind > ? {
964
1030
try tryEating { src in
965
- guard src. tryEat ( sequence: " (? " ) , let group = try src. lexGroupStart ( )
1031
+ guard src. tryEat ( sequence: " (? " ) ,
1032
+ let group = try src. lexGroupStart ( context: context)
966
1033
else { return nil }
967
1034
968
1035
// Implicitly scoped groups are not supported here.
@@ -1607,7 +1674,7 @@ extension Source {
1607
1674
var name : Located < String > ?
1608
1675
if src. tryEat ( " : " ) {
1609
1676
// TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the
1610
- // name under PCRE2_ALT_VERBNAMES.
1677
+ // name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x).
1611
1678
name = try src. expectQuoted ( endingWith: " ) " , eatEnding: false )
1612
1679
}
1613
1680
try src. expect ( " ) " )
0 commit comments