@@ -342,8 +342,8 @@ extension Source {
     }.value
   }

-  /// Eat a scalar off the front, starting from after the
-  /// backslash and base character (e.g. `\u` or `\x`).
+  /// Try to eat a scalar off the front, starting from after the backslash and
+  /// base character (e.g. `\u` or `\x`).
   ///
   /// UniScalar -> 'u{' UniScalarSequence '}'
   ///            | 'u' HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
   ///            | 'o{' OctalDigit{1...} '}'
   ///            | '0' OctalDigit{0...3}
   ///
-  mutating func expectUnicodeScalar(
-    escapedCharacter base: Character
-  ) throws -> AST.Atom.Kind {
+  mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
     try recordLoc { src in
+      try src.tryEating { src in

-      func nullScalar() -> AST.Atom.Kind {
-        let pos = src.currentPosition
-        return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
-      }
-
-      // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
-      switch base {
-      // Hex numbers.
-      case "u" where src.tryEat("{"):
-        return try src.expectUnicodeScalarSequence(eating: "}")
-
-      case "x" where src.tryEat("{"):
-        let str = try src.lexUntil(eating: "}")
-        return .scalar(try Source.validateUnicodeScalar(str, .hex))
-
-      case "x":
-        // \x expects *up to* 2 digits.
-        guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
-        else {
-          // In PCRE, \x without any valid hex digits is \u{0}.
-          // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
-          // could be changed to throw an error if we had a parsing mode for
-          // them.
-          return nullScalar()
+        func nullScalar() -> AST.Atom.Kind {
+          let pos = src.currentPosition
+          return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
         }
-        return .scalar(try Source.validateUnicodeScalar(digits, .hex))

-      case "u":
-        return .scalar(try src.expectUnicodeScalar(numDigits: 4))
-      case "U":
-        return .scalar(try src.expectUnicodeScalar(numDigits: 8))
+        // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
+        switch src.tryEat() {
+        // Hex numbers.
+        case "u" where src.tryEat("{"):
+          return try src.expectUnicodeScalarSequence(eating: "}")
+
+        case "x" where src.tryEat("{"):
+          let str = try src.lexUntil(eating: "}")
+          return .scalar(try Source.validateUnicodeScalar(str, .hex))
+
+        case "x":
+          // \x expects *up to* 2 digits.
+          guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
+          else {
+            // In PCRE, \x without any valid hex digits is \u{0}.
+            // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
+            // could be changed to throw an error if we had a parsing mode for
+            // them.
+            return nullScalar()
+          }
+          return .scalar(try Source.validateUnicodeScalar(digits, .hex))
+
+        case "u":
+          return .scalar(try src.expectUnicodeScalar(numDigits: 4))
+        case "U":
+          return .scalar(try src.expectUnicodeScalar(numDigits: 8))
+
+        // Octal numbers.
+        case "o" where src.tryEat("{"):
+          let str = try src.lexUntil(eating: "}")
+          return .scalar(try Source.validateUnicodeScalar(str, .octal))
+
+        case "0":
+          // We can read *up to* 3 more octal digits.
+          // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
+          // PCRE mode, we should limit it here.
+          guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
+          else {
+            return nullScalar()
+          }
+          return .scalar(try Source.validateUnicodeScalar(digits, .octal))

-      // Octal numbers.
-      case "o" where src.tryEat("{"):
-        let str = try src.lexUntil(eating: "}")
-        return .scalar(try Source.validateUnicodeScalar(str, .octal))
-
-      case "0":
-        // We can read *up to* 3 more octal digits.
-        // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
-        // PCRE mode, we should limit it here.
-        guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
-        else {
-          return nullScalar()
+        default:
+          return nil
         }
-        return .scalar(try Source.validateUnicodeScalar(digits, .octal))
-
-      default:
-        fatalError("Unexpected scalar start")
       }
     }.value
   }
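As a rough aid to following the hunk above, here is a minimal, self-contained sketch of the speculative-lexing pattern the new lexUnicodeScalar relies on. MiniSource, lexToyScalar, and the method signatures below are illustrative stand-ins, not the parser's real Source API; the point is only that tryEating rewinds the input whenever its closure returns nil, so the lexer can consume the leading base character itself and decline with nil instead of trapping on an unexpected escape.

// Toy source with speculative lexing; names and shapes are illustrative only.
struct MiniSource {
  var input: [Character]
  var position = 0

  // Return the next character without consuming it.
  func peek() -> Character? {
    position < input.count ? input[position] : nil
  }

  // Consume and return the next character, or nil at end of input.
  mutating func tryEat() -> Character? {
    guard let c = peek() else { return nil }
    position += 1
    return c
  }

  // Run `body`; if it produces nil, restore the position so nothing is consumed.
  mutating func tryEating<T>(_ body: (inout MiniSource) -> T?) -> T? {
    let saved = position
    guard let result = body(&self) else {
      position = saved
      return nil
    }
    return result
  }
}

// Toy scalar lexer mirroring the shape of the new code: eat the base character,
// then dispatch; anything that is not a scalar escape returns nil and the
// consumed base character is handed back by tryEating.
func lexToyScalar(_ src: inout MiniSource) -> UInt32? {
  src.tryEating { (src: inout MiniSource) -> UInt32? in
    guard let base = src.tryEat() else { return nil }
    switch base {
    case "x":
      var hex = ""
      while let c = src.peek(), c.isHexDigit {
        hex.append(c)
        _ = src.tryEat()
      }
      // PCRE treats a bare \x as U+0000; mirrored here in the toy version.
      return hex.isEmpty ? 0 : UInt32(hex, radix: 16)
    default:
      return nil
    }
  }
}

var scalarSource = MiniSource(input: Array("x41"))
print(lexToyScalar(&scalarSource) as Any)   // Optional(65), i.e. U+0041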
@@ -802,6 +802,11 @@ extension Source {
   mutating func lexMatchingOptionSequence(
     context: ParsingContext
   ) throws -> AST.MatchingOptionSequence? {
+    // PCRE accepts '(?)'
+    // TODO: This is a no-op, should we warn?
+    if peek() == ")" {
+      return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
+    }
     let ateCaret = recordLoc { $0.tryEat("^") }

     // TODO: Warn on duplicate options, and options appearing in both adding
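For illustration of the early-out added above: when '(?' is immediately followed by ')', PCRE treats the group as an empty, no-op matching-option sequence, so the lexer returns an empty sequence up front rather than trying to lex '^', option letters, or a '-' separator. The sketch below uses toy types, not the parser's real AST.MatchingOptionSequence.

// Illustrative only: a toy option-sequence lexer with the same early return.
enum ToyOption: Character {
  case caseInsensitive = "i", multiline = "m", extended = "x"
}

struct ToyOptionSequence {
  var adding: [ToyOption] = []
  var removing: [ToyOption] = []
}

// `text` is the input immediately after "(?"; consumes up to ')' or ':'.
func lexToyOptionSequence(_ text: inout Substring) -> ToyOptionSequence? {
  // PCRE accepts '(?)': an empty, no-op matching-option group.
  if text.first == ")" { return ToyOptionSequence() }

  var result = ToyOptionSequence()
  var isRemoving = false
  while let c = text.first, c != ")", c != ":" {
    text = text.dropFirst()
    if c == "-" { isRemoving = true; continue }
    guard let option = ToyOption(rawValue: c) else { return nil }
    if isRemoving { result.removing.append(option) } else { result.adding.append(option) }
  }
  return result
}

var options: Substring = "i-m)"
print(lexToyOptionSequence(&options)?.adding ?? [])    // [caseInsensitive]; 'm' went to removing
var noOp: Substring = ")"
print(lexToyOptionSequence(&noOp) != nil)              // true: '(?)' lexes as a no-op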
@@ -1707,6 +1712,11 @@ extension Source {
         return ref
       }

+      // Hexadecimal and octal unicode scalars.
+      if let scalar = try src.lexUnicodeScalar() {
+        return scalar
+      }
+
       guard let char = src.tryEat() else {
         throw ParseError.expectedEscape
       }
@@ -1718,14 +1728,6 @@ extension Source {
         return .escaped(builtin)
       }

-      switch char {
-      // Hexadecimal and octal unicode scalars.
-      case "u", "x", "U", "o", "0":
-        return try src.expectUnicodeScalar(escapedCharacter: char)
-      default:
-        break
-      }
-

       // We only allow unknown escape sequences for non-letter non-number ASCII,
       // and non-ASCII whitespace.
       // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
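The last two hunks change the call-site shape: the escape lexer now tries the optional-returning scalar lexer first and only falls back to generic escape handling afterwards, instead of switching on an already-eaten character. Below is a rough, self-contained sketch of that ordering over a plain Substring; the names are hypothetical and the fallback is deliberately simplified (the real parser is stricter about which unknown escapes it accepts).

enum ToyEscape: Equatable {
  case scalar(UInt32)
  case keepLiteral(Character)
}

// `text` is the input immediately after a backslash.
func lexToyEscapeBody(_ text: inout Substring) -> ToyEscape? {
  // Hexadecimal scalars such as \x41 (toy stand-in for the scalar lexer above).
  if text.first == "x" {
    let digits = text.dropFirst().prefix(while: \.isHexDigit)
    if let value = UInt32(digits, radix: 16) {
      text = text.dropFirst(1 + digits.count)
      return .scalar(value)
    }
  }
  // Simplified fallback: keep the escaped character literally.
  guard let char = text.first else { return nil }
  text = text.dropFirst()
  return .keepLiteral(char)
}

var hexEscape: Substring = "x7B"
print(lexToyEscapeBody(&hexEscape) == .scalar(0x7B))          // true
var unknownEscape: Substring = "$"
print(lexToyEscapeBody(&unknownEscape) == .keepLiteral("$"))  // true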