Skip to content

Commit 0878029

Browse files
authored
Merge pull request #487 from hamishknight/separate-these
2 parents 55ffc5c + e861af5 commit 0878029

File tree

3 files changed

+70
-93
lines changed

3 files changed

+70
-93
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 60 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,8 @@ extension Source {
342342
}.value
343343
}
344344

345-
/// Eat a scalar off the front, starting from after the
346-
/// backslash and base character (e.g. `\u` or `\x`).
345+
/// Try to eat a scalar off the front, starting immediately after the backslash,
346+
/// i.e. at the base character (e.g. the `u` in `\u` or the `x` in `\x`).
347347
///
348348
/// UniScalar -> 'u{' UniScalarSequence '}'
349349
/// | 'u' HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
353353
/// | 'o{' OctalDigit{1...} '}'
354354
/// | '0' OctalDigit{0...3}
355355
///
356-
mutating func expectUnicodeScalar(
357-
escapedCharacter base: Character
358-
) throws -> AST.Atom.Kind {
356+
mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
359357
try recordLoc { src in
358+
try src.tryEating { src in
360359

361-
func nullScalar() -> AST.Atom.Kind {
362-
let pos = src.currentPosition
363-
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
364-
}
365-
366-
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
367-
switch base {
368-
// Hex numbers.
369-
case "u" where src.tryEat("{"):
370-
return try src.expectUnicodeScalarSequence(eating: "}")
371-
372-
case "x" where src.tryEat("{"):
373-
let str = try src.lexUntil(eating: "}")
374-
return .scalar(try Source.validateUnicodeScalar(str, .hex))
375-
376-
case "x":
377-
// \x expects *up to* 2 digits.
378-
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
379-
else {
380-
// In PCRE, \x without any valid hex digits is \u{0}.
381-
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
382-
// could be changed to throw an error if we had a parsing mode for
383-
// them.
384-
return nullScalar()
360+
func nullScalar() -> AST.Atom.Kind {
361+
let pos = src.currentPosition
362+
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
385363
}
386-
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
387364

388-
case "u":
389-
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
390-
case "U":
391-
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
365+
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
366+
switch src.tryEat() {
367+
// Hex numbers.
368+
case "u" where src.tryEat("{"):
369+
return try src.expectUnicodeScalarSequence(eating: "}")
370+
371+
case "x" where src.tryEat("{"):
372+
let str = try src.lexUntil(eating: "}")
373+
return .scalar(try Source.validateUnicodeScalar(str, .hex))
374+
375+
case "x":
376+
// \x expects *up to* 2 digits.
377+
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
378+
else {
379+
// In PCRE, \x without any valid hex digits is \u{0}.
380+
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
381+
// could be changed to throw an error if we had a parsing mode for
382+
// them.
383+
return nullScalar()
384+
}
385+
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
386+
387+
case "u":
388+
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
389+
case "U":
390+
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
391+
392+
// Octal numbers.
393+
case "o" where src.tryEat("{"):
394+
let str = try src.lexUntil(eating: "}")
395+
return .scalar(try Source.validateUnicodeScalar(str, .octal))
396+
397+
case "0":
398+
// We can read *up to* 3 more octal digits.
399+
// FIXME: PCRE can only read up to 2 octal digits; if we get a strict
400+
// PCRE mode, we should limit it here.
401+
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
402+
else {
403+
return nullScalar()
404+
}
405+
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
392406

393-
// Octal numbers.
394-
case "o" where src.tryEat("{"):
395-
let str = try src.lexUntil(eating: "}")
396-
return .scalar(try Source.validateUnicodeScalar(str, .octal))
397-
398-
case "0":
399-
// We can read *up to* 3 more octal digits.
400-
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
401-
// PCRE mode, we should limit it here.
402-
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
403-
else {
404-
return nullScalar()
407+
default:
408+
return nil
405409
}
406-
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
407-
408-
default:
409-
fatalError("Unexpected scalar start")
410410
}
411411
}.value
412412
}
@@ -802,6 +802,11 @@ extension Source {
802802
mutating func lexMatchingOptionSequence(
803803
context: ParsingContext
804804
) throws -> AST.MatchingOptionSequence? {
805+
// PCRE accepts '(?)'
806+
// TODO: This is a no-op, should we warn?
807+
if peek() == ")" {
808+
return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
809+
}
805810
let ateCaret = recordLoc { $0.tryEat("^") }
806811

807812
// TODO: Warn on duplicate options, and options appearing in both adding
@@ -1707,6 +1712,11 @@ extension Source {
17071712
return ref
17081713
}
17091714

1715+
// Hexadecimal and octal unicode scalars.
1716+
if let scalar = try src.lexUnicodeScalar() {
1717+
return scalar
1718+
}
1719+
17101720
guard let char = src.tryEat() else {
17111721
throw ParseError.expectedEscape
17121722
}
@@ -1718,14 +1728,6 @@ extension Source {
17181728
return .escaped(builtin)
17191729
}
17201730

1721-
switch char {
1722-
// Hexadecimal and octal unicode scalars.
1723-
case "u", "x", "U", "o", "0":
1724-
return try src.expectUnicodeScalar(escapedCharacter: char)
1725-
default:
1726-
break
1727-
}
1728-
17291731
// We only allow unknown escape sequences for non-letter non-number ASCII,
17301732
// and non-ASCII whitespace.
17311733
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.

Tests/RegexTests/LexTests.swift

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -61,41 +61,6 @@ extension RegexTests {
6161
_ = try src.lexNumber()
6262
}
6363

64-
func diagnoseUniScalarOverflow(_ input: String, base: Character) {
65-
let scalars = input.first == "{"
66-
? String(input.dropFirst().dropLast())
67-
: input
68-
diagnose(
69-
input,
70-
expecting: .numberOverflow(scalars)
71-
) { src in
72-
_ = try src.expectUnicodeScalar(escapedCharacter: base)
73-
}
74-
}
75-
func diagnoseUniScalar(
76-
_ input: String,
77-
base: Character,
78-
expectedDigits numDigits: Int
79-
) {
80-
let scalars = input.first == "{"
81-
? String(input.dropFirst().dropLast())
82-
: input
83-
diagnose(
84-
input,
85-
expecting: .expectedNumDigits(scalars, numDigits)
86-
) { src in
87-
_ = try src.expectUnicodeScalar(escapedCharacter: base)
88-
}
89-
_ = scalars
90-
}
91-
92-
diagnoseUniScalar(
93-
"12", base: "u", expectedDigits: 4)
94-
diagnoseUniScalar(
95-
"12", base: "U", expectedDigits: 8)
96-
diagnoseUniScalarOverflow("{123456789}", base: "u")
97-
diagnoseUniScalarOverflow("{123456789}", base: "x")
98-
9964
// TODO: want to dummy print out source ranges, etc, test that.
10065
}
10166

Tests/RegexTests/ParseTests.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,6 +1002,9 @@ extension RegexTests {
10021002
concat("a", atomicScriptRun("b"), "c"), throwsError: .unsupported)
10031003

10041004
// Matching option changing groups.
1005+
parseTest("(?)", changeMatchingOptions(
1006+
matchingOptions()
1007+
))
10051008
parseTest("(?-)", changeMatchingOptions(
10061009
matchingOptions()
10071010
))
@@ -2666,6 +2669,8 @@ extension RegexTests {
26662669

26672670
diagnosticTest("\\", .expectedEscape)
26682671

2672+
diagnosticTest(#"\o"#, .invalidEscape("o"))
2673+
26692674
// TODO: Custom diagnostic for control sequence
26702675
diagnosticTest(#"\c"#, .unexpectedEndOfInput)
26712676

@@ -2877,6 +2882,11 @@ extension RegexTests {
28772882
diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class"))
28782883
diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class"))
28792884

2885+
diagnosticTest(#"\u12"#, .expectedNumDigits("12", 4))
2886+
diagnosticTest(#"\U12"#, .expectedNumDigits("12", 8))
2887+
diagnosticTest(#"\u{123456789}"#, .numberOverflow("123456789"))
2888+
diagnosticTest(#"\x{123456789}"#, .numberOverflow("123456789"))
2889+
28802890
// MARK: Matching options
28812891

28822892
diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret)

0 commit comments

Comments
 (0)