Skip to content

Commit 9cf3cfc

Browse files
authored
Merge pull request #393 from hamishknight/stricter-syntax
2 parents 489c63c + db58c1b commit 9cf3cfc

File tree

12 files changed

+145
-20
lines changed

12 files changed

+145
-20
lines changed

Sources/_RegexParser/Regex/AST/AST.swift

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ extension AST {
5252
/// Comments, non-semantic whitespace, etc
5353
case trivia(Trivia)
5454

55+
/// Interpolation `<{...}>`, currently reserved for future use.
56+
case interpolation(Interpolation)
57+
5558
case atom(Atom)
5659

5760
case customCharacterClass(CustomCharacterClass)
@@ -77,6 +80,7 @@ extension AST.Node {
7780
case let .quantification(v): return v
7881
case let .quote(v): return v
7982
case let .trivia(v): return v
83+
case let .interpolation(v): return v
8084
case let .atom(v): return v
8185
case let .customCharacterClass(v): return v
8286
case let .empty(v): return v
@@ -129,7 +133,7 @@ extension AST.Node {
129133
case .conditional, .customCharacterClass, .absentFunction:
130134
return true
131135
case .alternation, .concatenation, .quantification, .quote, .trivia,
132-
.empty:
136+
.empty, .interpolation:
133137
return false
134138
}
135139
}
@@ -193,6 +197,16 @@ extension AST {
193197
}
194198
}
195199

200+
public struct Interpolation: Hashable, _ASTNode {
201+
public let contents: String
202+
public let location: SourceLocation
203+
204+
public init(_ contents: String, _ location: SourceLocation) {
205+
self.contents = contents
206+
self.location = location
207+
}
208+
}
209+
196210
public struct Empty: Hashable, _ASTNode {
197211
public let location: SourceLocation
198212

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -819,7 +819,7 @@ extension AST.Node {
819819
case .alternation, .concatenation, .group,
820820
.conditional, .quantification, .quote,
821821
.trivia, .customCharacterClass, .empty,
822-
.absentFunction:
822+
.absentFunction, .interpolation:
823823
return nil
824824
}
825825
}

Sources/_RegexParser/Regex/Parse/CaptureList.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ extension AST.Node {
103103
break
104104
}
105105

106-
case .quote, .trivia, .atom, .customCharacterClass, .empty:
106+
case .quote, .trivia, .atom, .customCharacterClass, .empty, .interpolation:
107107
break
108108
}
109109
}

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ enum ParseError: Error, Hashable {
4242
case expectedNonEmptyContents
4343
case expectedEscape
4444
case invalidEscape(Character)
45+
case confusableCharacter(Character)
4546

4647
case cannotReferToWholePattern
4748

@@ -128,6 +129,8 @@ extension ParseError: CustomStringConvertible {
128129
return "expected escape sequence"
129130
case .invalidEscape(let c):
130131
return "invalid escape sequence '\\\(c)'"
132+
case .confusableCharacter(let c):
133+
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
131134
case .cannotReferToWholePattern:
132135
return "cannot refer to whole pattern here"
133136
case .quantifierRequiresOperand(let q):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,26 @@ extension Source {
589589
return AST.Quote(str.value, str.location)
590590
}
591591

592+
/// Try to consume an interpolation sequence.
593+
///
594+
/// Interpolation -> '<{' String '}>'
595+
///
596+
mutating func lexInterpolation() throws -> AST.Interpolation? {
597+
let contents = try recordLoc { src -> String? in
598+
try src.tryEating { src in
599+
guard src.tryEat(sequence: "<{") else { return nil }
600+
_ = src.lexUntil { $0.isEmpty || $0.starts(with: "}>") }
601+
guard src.tryEat(sequence: "}>") else { return nil }
602+
603+
// Not currently supported. We error here instead of during Sema to
604+
// get a better error for something like `(<{)}>`.
605+
throw ParseError.unsupported("interpolation")
606+
}
607+
}
608+
guard let contents = contents else { return nil }
609+
return .init(contents.value, contents.location)
610+
}
611+
592612
/// Try to consume a comment
593613
///
594614
/// Comment -> '(?#' [^')']* ')'
@@ -1674,9 +1694,10 @@ extension Source {
16741694
break
16751695
}
16761696

1677-
// We only allow unknown escape sequences for non-letter ASCII, and
1678-
// non-ASCII whitespace.
1679-
guard (char.isASCII && !char.isLetter) ||
1697+
// We only allow unknown escape sequences for non-letter non-number ASCII,
1698+
// and non-ASCII whitespace.
1699+
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
1700+
guard (char.isASCII && !char.isLetter && !char.isNumber) ||
16801701
(!char.isASCII && char.isWhitespace)
16811702
else {
16821703
throw ParseError.invalidEscape(char)
@@ -1981,10 +2002,21 @@ extension Source {
19812002

19822003
case "]":
19832004
assert(!customCC, "parser should have prevented this")
1984-
fallthrough
2005+
break
19852006

1986-
default: return .char(char)
2007+
default:
2008+
// Reject non-letter non-number non-`\r\n` ASCII characters that have
2009+
// multiple scalars. These may be confusable for metacharacters, e.g.
2010+
// `[\u{301}]` wouldn't be interpreted as a custom character class due
2011+
// to the combining accent (assuming it is literal, not `\u{...}`).
2012+
let scalars = char.unicodeScalars
2013+
if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" &&
2014+
!char.isLetter && !char.isNumber {
2015+
throw ParseError.confusableCharacter(char)
2016+
}
2017+
break
19872018
}
2019+
return .char(char)
19882020
}
19892021
guard let kind = kind else { return nil }
19902022
return AST.Atom(kind.value, kind.location)

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,13 @@ extension Parser {
222222
result.append(.quote(quote))
223223
continue
224224
}
225+
226+
// Interpolation -> `lexInterpolation`
227+
if let interpolation = try source.lexInterpolation() {
228+
result.append(.interpolation(interpolation))
229+
continue
230+
}
231+
225232
// Quantification -> QuantOperand Quantifier?
226233
if let operand = try parseQuantifierOperand() {
227234
if let (amt, kind, trivia) =

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,11 @@ extension RegexValidator {
395395
// These are Oniguruma specific.
396396
throw error(.unsupported("absent function"), at: a.location)
397397

398+
case .interpolation(let i):
399+
// This is currently rejected in the parser for better diagnostics, but
400+
// reject here too until we get runtime support.
401+
throw error(.unsupported("interpolation"), at: i.location)
402+
398403
case .quote, .trivia, .empty:
399404
break
400405
}

Sources/_RegexParser/Regex/Printing/DumpAST.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ extension AST.Trivia {
101101
}
102102
}
103103

104+
extension AST.Interpolation {
105+
public var _dumpBase: String { "interpolation <\(contents)>" }
106+
}
107+
104108
extension AST.Empty {
105109
public var _dumpBase: String { "" }
106110
}

Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ extension PrettyPrinter {
9797
case let .trivia(t):
9898
output(t._canonicalBase)
9999

100+
case let .interpolation(i):
101+
output(i._canonicalBase)
102+
100103
case let .atom(a):
101104
output(a._canonicalBase)
102105

@@ -178,6 +181,12 @@ extension AST.Quote {
178181
}
179182
}
180183

184+
extension AST.Interpolation {
185+
var _canonicalBase: String {
186+
"<{\(contents)}>"
187+
}
188+
}
189+
181190
extension AST.Group.Kind {
182191
var _canonicalBase: String {
183192
switch self {

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ extension AST.Node {
137137
case let .trivia(v):
138138
return .trivia(v.contents)
139139

140+
case .interpolation:
141+
throw Unsupported("TODO: interpolation")
142+
140143
case let .atom(v):
141144
switch v.kind {
142145
case .scalarSequence(let seq):

Tests/RegexTests/MatchTests.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,12 @@ extension RegexTests {
218218
firstMatchTest(
219219
#"abc\d"#, input: "xyzabc123", match: "abc1")
220220

221+
// MARK: Allowed combining characters
222+
223+
firstMatchTest("e\u{301}", input: "e\u{301}", match: "e\u{301}")
224+
firstMatchTest("1\u{358}", input: "1\u{358}", match: "1\u{358}")
225+
firstMatchTest(#"\ \#u{361}"#, input: " \u{361}", match: " \u{361}")
226+
221227
// MARK: Alternations
222228

223229
firstMatchTest(

Tests/RegexTests/ParseTests.swift

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,12 @@ extension RegexTests {
394394
#"abc\d"#,
395395
concat("a", "b", "c", escaped(.decimalDigit)))
396396

397+
// MARK: Allowed combining characters
398+
399+
parseTest("e\u{301}", "e\u{301}")
400+
parseTest("1\u{358}", "1\u{358}")
401+
parseTest(#"\ \#u{361}"#, " \u{361}")
402+
397403
// MARK: Alternations
398404

399405
parseTest(
@@ -466,14 +472,6 @@ extension RegexTests {
466472
parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
467473
parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))
468474

469-
// TODO: These are treated as octal sequences by PCRE, we should warn and
470-
// suggest user prefix with 0.
471-
parseTest(#"[\1]"#, charClass("1"))
472-
parseTest(#"[\123]"#, charClass("1", "2", "3"))
473-
parseTest(#"[\101]"#, charClass("1", "0", "1"))
474-
parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
475-
parseTest(#"[\181]"#, charClass("1", "8", "1"))
476-
477475
// We take *up to* the first two valid digits for \x. No valid digits is 0.
478476
parseTest(#"\x"#, scalar("\u{0}"))
479477
parseTest(#"\x5"#, scalar("\u{5}"))
@@ -484,6 +482,8 @@ extension RegexTests {
484482
parseTest(#"\u{ a }"#, scalar("\u{A}"))
485483
parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}")))
486484

485+
parseTest(#"[\u{301}]"#, charClass(scalar_m("\u{301}")))
486+
487487
// MARK: Scalar sequences
488488

489489
parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}"))
@@ -788,6 +788,20 @@ extension RegexTests {
788788
#"a(?#. comment)b"#,
789789
concat("a", "b"))
790790

791+
// MARK: Interpolation
792+
793+
// These are literal as there's no closing '}>'
794+
parseTest("<{", concat("<", "{"))
795+
parseTest("<{a", concat("<", "{", "a"))
796+
parseTest("<{a}", concat("<", "{", "a", "}"))
797+
parseTest("<{<{}", concat("<", "{", "<", "{", "}"))
798+
799+
// Literal as escaped
800+
parseTest(#"\<{}>"#, concat("<", "{", "}", ">"))
801+
802+
// A quantification
803+
parseTest(#"<{2}"#, exactly(2, of: "<"))
804+
791805
// MARK: Quantification
792806

793807
parseTest("a*", zeroOrMore(of: "a"))
@@ -1251,10 +1265,6 @@ extension RegexTests {
12511265
parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported)
12521266
parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported)
12531267

1254-
// Backreferences are not valid in custom character classes.
1255-
parseTest(#"[\8]"#, charClass("8"))
1256-
parseTest(#"[\9]"#, charClass("9"))
1257-
12581268
// These are valid references.
12591269
parseTest(#"()\1"#, concat(
12601270
capture(empty()), backreference(.absolute(1))
@@ -2536,6 +2546,17 @@ extension RegexTests {
25362546
// TODO: Custom diagnostic for missing '\Q'
25372547
diagnosticTest(#"\E"#, .invalidEscape("E"))
25382548

2549+
// PCRE treats these as octal, but we require a `0` prefix.
2550+
diagnosticTest(#"[\1]"#, .invalidEscape("1"))
2551+
diagnosticTest(#"[\123]"#, .invalidEscape("1"))
2552+
diagnosticTest(#"[\101]"#, .invalidEscape("1"))
2553+
diagnosticTest(#"[\7777]"#, .invalidEscape("7"))
2554+
diagnosticTest(#"[\181]"#, .invalidEscape("1"))
2555+
2556+
// Backreferences are not valid in custom character classes.
2557+
diagnosticTest(#"[\8]"#, .invalidEscape("8"))
2558+
diagnosticTest(#"[\9]"#, .invalidEscape("9"))
2559+
25392560
// Non-ASCII non-whitespace cases.
25402561
diagnosticTest(#"\🔥"#, .invalidEscape("🔥"))
25412562
diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰"))
@@ -2544,6 +2565,27 @@ extension RegexTests {
25442565
diagnosticTest(#"\˂"#, .invalidEscape("˂"))
25452566
diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}"))
25462567

2568+
// MARK: Confusable characters
2569+
2570+
diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}"))
2571+
diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}"))
2572+
diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}"))
2573+
diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#))
2574+
diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}"))
2575+
diagnosticTest("$\u{35E}", .confusableCharacter("$\u{35E}"))
2576+
diagnosticTest(".\u{35F}", .confusableCharacter(".\u{35F}"))
2577+
diagnosticTest("|\u{360}", .confusableCharacter("|\u{360}"))
2578+
diagnosticTest(" \u{361}", .confusableCharacter(" \u{361}"))
2579+
2580+
// MARK: Interpolation (currently unsupported)
2581+
2582+
diagnosticTest("<{}>", .unsupported("interpolation"))
2583+
diagnosticTest("<{...}>", .unsupported("interpolation"))
2584+
diagnosticTest("<{)}>", .unsupported("interpolation"))
2585+
diagnosticTest("<{}}>", .unsupported("interpolation"))
2586+
diagnosticTest("<{<{}>", .unsupported("interpolation"))
2587+
diagnosticTest("(<{)}>", .unsupported("interpolation"))
2588+
25472589
// MARK: Character properties
25482590

25492591
diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx"))

0 commit comments

Comments
 (0)