Skip to content

Commit 8b3e2ef

Browse files
committed
Diagnose unprintable ASCII characters
This matches the behavior of the C++ lexer for string literals.
1 parent 7e82082 commit 8b3e2ef

File tree

3 files changed

+70
-8
lines changed

3 files changed

+70
-8
lines changed

Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
4242
case endOfString
4343
case invalidUTF8 // TODO: better range reporting
4444
case unknownDelimiter
45+
case unprintableASCII
4546
}
4647

4748
var kind: Kind
@@ -59,6 +60,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
5960
case .endOfString: return "unterminated regex literal"
6061
case .invalidUTF8: return "invalid UTF-8 found in source file"
6162
case .unknownDelimiter: return "unknown regex literal delimiter"
63+
case .unprintableASCII: return "unprintable ASCII character found in source file"
6264
}
6365
}
6466
}
@@ -169,6 +171,11 @@ fileprivate struct DelimiterLexer {
169171
advanceCursor()
170172
try advance(escaped: true)
171173

174+
case let next where !next.isPrintableASCII:
175+
// Diagnose unprintable ASCII.
176+
// TODO: Ideally we would recover and continue to lex until the ending
177+
// delimiter.
178+
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
172179

173180
default:
174181
advanceCursor()

Sources/_MatchingEngine/Utility/MissingUnicode.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,3 +661,11 @@ extension Character {
661661

662662
public var isWordCharacter: Bool { isLetter || isNumber || self == "_" }
663663
}
664+
665+
extension UnicodeScalar {
  /// Whether this scalar is a printable ASCII character.
  ///
  /// The printable range starts at the space character U+20 and ends just
  /// before the DEL character U+7F. Control characters below U+20, DEL
  /// itself, and every scalar above it yield `false`.
  public var isPrintableASCII: Bool {
    (0x20 ..< 0x7F).contains(value)
  }
}

Tests/RegexTests/ParseTests.swift

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -107,20 +107,26 @@ func parseTest(
107107
serializedCaptures.deallocate()
108108
}
109109

110-
func parseWithDelimitersTest(
111-
_ input: String, _ expecting: AST.Node,
112-
file: StaticString = #file, line: UInt = #line
110+
func delimiterLexingTest(
111+
_ input: String, file: StaticString = #file, line: UInt = #line
113112
) {
114-
// First try lexing.
115-
input.withCString { ptr in
116-
let (contents, delim, end) = try! lexRegex(start: ptr,
117-
end: ptr + input.count)
118-
XCTAssertEqual(end, ptr + input.count, file: file, line: line)
113+
input.withCString(encodedAs: UTF8.self) { ptr in
114+
let endPtr = ptr + input.utf8.count
115+
let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
116+
XCTAssertEqual(end, endPtr, file: file, line: line)
119117

120118
let (parseContents, parseDelim) = droppingRegexDelimiters(input)
121119
XCTAssertEqual(contents, parseContents, file: file, line: line)
122120
XCTAssertEqual(delim, parseDelim, file: file, line: line)
123121
}
122+
}
123+
124+
func parseWithDelimitersTest(
125+
_ input: String, _ expecting: AST.Node,
126+
file: StaticString = #file, line: UInt = #line
127+
) {
128+
// First try lexing.
129+
delimiterLexingTest(input, file: file, line: line)
124130

125131
let orig = try! parseWithDelimiters(input)
126132
let ast = orig.root
@@ -199,6 +205,32 @@ func diagnosticTest(
199205
}
200206
}
201207

208+
/// Asserts that lexing `input` with `lexRegex` throws a `DelimiterLexError`
/// whose kind equals `expected`.
///
/// A failure is recorded if lexing succeeds, if the thrown
/// `DelimiterLexError` has a different kind, or if any other error type is
/// thrown.
///
/// - Parameters:
///   - input: The source text handed to `lexRegex`, delimiters included.
///   - expected: The error kind the lexer is required to throw.
///   - syntax: Not consulted by this helper; retained so existing call sites
///     keep compiling.
///   - file: The file attributed to any recorded failure.
///   - line: The line attributed to any recorded failure.
func delimiterLexingDiagnosticTest(
  _ input: String, _ expected: DelimiterLexError.Kind,
  syntax: SyntaxOptions = .traditional,
  file: StaticString = #file, line: UInt = #line
) {
  do {
    _ = try input.withCString { ptr in
      // The C string is UTF-8 encoded, so its length is the UTF-8 byte count.
      // `input.count` counts Characters and would place the end pointer short
      // of the real end for any non-ASCII input.
      try lexRegex(start: ptr, end: ptr + input.utf8.count)
    }
    XCTFail("""
      Passed, but expected error: \(expected)
      """, file: file, line: line)
  } catch let e as DelimiterLexError {
    guard e.kind == expected else {
      XCTFail("""

        Expected: \(expected)
        Actual: \(e.kind)
        """, file: file, line: line)
      return
    }
  } catch let e {
    XCTFail("Unexpected error type: \(e)", file: file, line: line)
  }
}
233+
202234
func libswiftDiagnosticMessageTest(
203235
_ input: String, _ expectedErr: String, file: StaticString = #file,
204236
line: UInt = #line
@@ -1472,6 +1504,11 @@ extension RegexTests {
14721504

14731505
parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
14741506

1507+
parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
1508+
parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅"))
1509+
1510+
// Printable ASCII characters.
1511+
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
14751512
// MARK: Parse not-equal
14761513

14771514
// Make sure dumping output correctly reflects differences in AST.
@@ -1890,6 +1927,16 @@ extension RegexTests {
18901927
diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal))
18911928
}
18921929

1930+
/// Checks the error kinds expected from the delimiter lexer for literals
/// containing line breaks and unprintable ASCII characters.
func testDelimiterLexingErrors() {
  // A backslash followed by a newline is expected to end the literal early.
  delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)

  // Control characters U+1 through U+1F are expected to be diagnosed as
  // unprintable, except U+A (\n) and U+D (\r), which are checked separately
  // below.
  for byte: UInt8 in 0x1 ..< 0x20 {
    if byte == 0xA || byte == 0xD { continue }
    delimiterLexingDiagnosticTest("re'\(UnicodeScalar(byte))'", .unprintableASCII)
  }

  // Raw line breaks are expected to report an unterminated literal.
  delimiterLexingDiagnosticTest("re'\n'", .endOfString)
  delimiterLexingDiagnosticTest("re'\r'", .endOfString)

  // DEL (U+7F) is expected to be diagnosed as unprintable as well.
  delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
}
1939+
18931940
func testlibswiftDiagnostics() {
18941941
libswiftDiagnosticMessageTest(
18951942
"#/[x*/#", "cannot parse regular expression: expected ']'")

0 commit comments

Comments
 (0)