Skip to content

Commit 61450e8

Browse files
committed
Add lexing heuristic to handle single quotes in re'...'
If a single quote is encountered with a prefix of either `(?`, `(?(`, `\k`, `\g` or `(?C`, continue to scan ahead to a closing `'`. Such prefixes would not be valid endings for a regex literal anyway, and this lets us handle the single quote variant of their syntax. For the group name cases, further refine this skipping behavior by only skipping over characters that could possibly appear in that case. This improves diagnostic behavior by ensuring we don't go wandering off into Swift code.
1 parent 56414b8 commit 61450e8

File tree

2 files changed

+191
-7
lines changed

2 files changed

+191
-7
lines changed

Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer {
104104
slice(at: cursor, count)
105105
}
106106

107+
/// Look backwards from the cursor and return the `count` bytes that sit
/// immediately before it. Yields `nil` when that span would begin before
/// `start`, i.e. when fewer than `count` bytes precede the cursor.
func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
  let sliceStart = cursor - count
  if sliceStart < start {
    return nil
  }
  return slice(at: sliceStart, count)
}
114+
107115
/// Advance the cursor `n` bytes.
108116
mutating func advanceCursor(_ n: Int = 1) {
109117
cursor += n
@@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer {
123131
return true
124132
}
125133

134+
/// Attempt to skip over a closing delimiter character that is unlikely to be
/// the actual closing delimiter.
///
/// This only ever applies to the `'` of `re'...'`. If the cursor is on a `'`
/// that directly follows an unescaped `(?`, `(?(`, `\k`, `\g` or `(?C`, the
/// quote is assumed to open a quoted name/reference/callout string, and the
/// cursor is moved past the matching closing `'`. If no such closing `'` can
/// be found, the cursor is left where it was so that the `'` is lexed as the
/// closing delimiter after all.
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
  // Only the closing `'` for re'...' can potentially be skipped over.
  switch delimiter {
  case .traditional, .experimental:
    return
  case .reSingleQuote:
    break
  }
  guard load() == ascii("'") else { return }

  // Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
  // are the cases that could use single quotes. Note that none of these
  // would be valid regex endings anyway.
  let calloutPrefix = "(?C"
  let candidates = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix]
  let prefix = candidates.first { candidate in
    let length = candidate.utf8.count
    guard let priorSlice = sliceBehind(length),
          priorSlice.elementsEqual(candidate.utf8)
    else { return false }

    // Make sure the prefix isn't itself escaped, as that invalidates this
    // analysis. Count the whole run of backslashes that precedes it rather
    // than only the nearest one: an even run consists purely of escaped
    // backslashes (`\\`) and leaves the prefix intact, whereas an odd run
    // leaves one backslash over that escapes the first prefix character.
    var backslashCount = 0
    while let behind = sliceBehind(length + backslashCount + 1),
          behind[0] == ascii("\\") {
      backslashCount += 1
    }
    return backslashCount.isMultiple(of: 2)
  }
  guard let prefix = prefix else { return }
  let isCallout = prefix == calloutPrefix

  func isPossiblyGroupReference(_ c: UInt8) -> Bool {
    // If this is an ASCII character, make sure it's for a group name. Leave
    // other UTF-8 encoded scalars alone, this should at least catch cases
    // where we run into a symbol such as `{`, `.`, `;` that would indicate
    // we've likely advanced out of the bounds of the regex.
    let scalar = UnicodeScalar(c)
    guard scalar.isASCII else { return true }
    switch scalar {
    // Include '-' and '+' which may be used in recursion levels and relative
    // references.
    case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
      return true
    default:
      return false
    }
  }

  // Make a note of the current lexing position, as we may need to revert
  // back to it.
  let originalCursor = cursor
  advanceCursor()

  // Try to skip over what would be the contents of a group
  // identifier/reference.
  while let next = load() {
    // Found the ending, we're done. Return so we can continue to lex to the
    // real delimiter.
    if next == ascii("'") {
      advanceCursor()
      return
    }

    // If this isn't a callout, make sure we have something that could be a
    // group reference. We limit the character set here to improve diagnostic
    // behavior in the case where the literal is actually unterminated. We
    // ideally don't want to go wandering off into Swift source code. We can't
    // do the same for callouts, as they take arbitrary strings.
    guard isCallout || isPossiblyGroupReference(next) else { break }
    do {
      try advance()
    } catch {
      // `advance()` refused this character; give up on skipping.
      break
    }
  }
  // We bailed out, either because we ran into something that didn't look like
  // an identifier, or we reached the end of the line. Revert back to the
  // original guess of delimiter.
  cursor = originalCursor
}
213+
126214
/// Attempt to eat a particular closing delimiter, returning the contents of
127215
/// the literal, and ending pointer, or `nil` if this is not a delimiter
128216
/// ending.
@@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer {
194282

195283
let contentsStart = cursor
196284
while true {
285+
// Check to see if we're at a character that looks like a delimiter, but
286+
// likely isn't. In such a case, we can attempt to skip over it.
287+
trySkipDelimiter(delimiter)
288+
197289
// Try to lex the closing delimiter.
198290
if let (contents, end) = try tryEatEnding(delimiter,
199291
contentsStart: contentsStart) {

Tests/RegexTests/ParseTests.swift

Lines changed: 99 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,28 +107,46 @@ func parseTest(
107107
serializedCaptures.deallocate()
108108
}
109109

110+
/// Test delimiter lexing. Takes an input string that starts with a regex
/// literal and checks that the lexed contents and delimiter agree with
/// `droppingRegexDelimiters` applied to the lexed literal.
///
/// - Parameters:
///   - input: The string to lex, beginning with a regex literal.
///   - ignoreTrailing: If `false` (the default), the literal must span the
///     whole of `input`. If `true`, the literal must end *before* the end of
///     `input`; the characters that follow it are not considered part of it.
///   - file: The file to attribute assertion failures to.
///   - line: The line to attribute assertion failures to.
/// - Returns: The text of the lexed literal, including its delimiters and
///   excluding any trailing characters.
@discardableResult
func delimiterLexingTest(
  _ input: String, ignoreTrailing: Bool = false,
  file: StaticString = #file, line: UInt = #line
) -> String {
  input.withCString(encodedAs: UTF8.self) { ptr in
    let endPtr = ptr + input.utf8.count
    let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
    if ignoreTrailing {
      XCTAssertNotEqual(end, endPtr, file: file, line: line)
    } else {
      XCTAssertEqual(end, endPtr, file: file, line: line)
    }

    // Rebuild the literal from the bytes the lexer actually consumed, so that
    // any trailing characters are excluded from the comparison below.
    let rawPtr = UnsafeRawPointer(ptr)
    let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr)
    let literal = String(decoding: buffer, as: UTF8.self)

    let (parseContents, parseDelim) = droppingRegexDelimiters(literal)
    XCTAssertEqual(contents, parseContents, file: file, line: line)
    XCTAssertEqual(delim, parseDelim, file: file, line: line)
    return literal
  }
}
123137

138+
/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is
139+
/// true, there may be additional characters that follow the literal that are
140+
/// not considered part of it.
124141
func parseWithDelimitersTest(
125-
_ input: String, _ expecting: AST.Node,
142+
_ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false,
126143
file: StaticString = #file, line: UInt = #line
127144
) {
128145
// First try lexing.
129-
delimiterLexingTest(input, file: file, line: line)
146+
let literal = delimiterLexingTest(
147+
input, ignoreTrailing: ignoreTrailing, file: file, line: line)
130148

131-
let orig = try! parseWithDelimiters(input)
149+
let orig = try! parseWithDelimiters(literal)
132150
let ast = orig.root
133151
guard ast == expecting
134152
|| ast._dump() == expecting._dump() // EQ workaround
@@ -1509,6 +1527,63 @@ extension RegexTests {
15091527

15101528
// Printable ASCII characters.
15111529
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
1530+
1531+
// MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
1532+
// if it's clear that it's part of the regex syntax.
1533+
1534+
parseWithDelimitersTest(
1535+
#"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
1536+
parseWithDelimitersTest(
1537+
#"re'(?'a_bcA0-c1A'x*)'"#,
1538+
balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")))
1539+
1540+
parseWithDelimitersTest(
1541+
#"re'(?('a_bcA0')x|y)'"#, conditional(
1542+
.groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"))
1543+
parseWithDelimitersTest(
1544+
#"re'(?('+20')\')'"#, conditional(
1545+
.groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()))
1546+
1547+
parseWithDelimitersTest(
1548+
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))))
1549+
parseWithDelimitersTest(
1550+
#"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1))
1551+
1552+
parseWithDelimitersTest(
1553+
#"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))))
1554+
parseWithDelimitersTest(
1555+
#"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"))
1556+
1557+
parseWithDelimitersTest(
1558+
#"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)))
1559+
1560+
// Fine, because we don't end up skipping.
1561+
delimiterLexingTest(#"re'(?'"#)
1562+
delimiterLexingTest(#"re'(?('"#)
1563+
delimiterLexingTest(#"re'\k'"#)
1564+
delimiterLexingTest(#"re'\g'"#)
1565+
delimiterLexingTest(#"re'(?C'"#)
1566+
1567+
// Not a valid group name, but we can still skip over it.
1568+
delimiterLexingTest(#"re'(?'🔥')'"#)
1569+
1570+
// Escaped, so don't skip. These will ignore the ending `'` as we've already
1571+
// closed the literal.
1572+
parseWithDelimitersTest(
1573+
#"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
1574+
)
1575+
parseWithDelimitersTest(
1576+
#"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
1577+
)
1578+
parseWithDelimitersTest(
1579+
#"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
1580+
)
1581+
parseWithDelimitersTest(
1582+
#"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
1583+
)
1584+
delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
1585+
delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
1586+
15121587
// MARK: Parse not-equal
15131588

15141589
// Make sure dumping output correctly reflects differences in AST.
@@ -1815,6 +1890,12 @@ extension RegexTests {
18151890
diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName))
18161891
diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName))
18171892

1893+
// TODO: It might be better if we tried to consume up to the closing `'` and
1894+
// diagnosed an invalid group name based on that.
1895+
diagnosticTest(#"(?'abc ')"#, .expected("'"))
1896+
1897+
diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName))
1898+
18181899
diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName))
18191900
diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName))
18201901
diagnosticTest(#"(?'a-b-c')"#, .expected("'"))
@@ -1928,13 +2009,24 @@ extension RegexTests {
19282009
}
19292010

19302011
/// Checks that malformed `re'...'` literals are rejected by delimiter lexing
/// with the expected error.
func testDelimiterLexingErrors() {

  // MARK: Printable ASCII

  delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
  for i: UInt8 in 0x1 ..< 0x20 {
    // U+A & U+D are \n and \r, which are covered separately below.
    if i == 0xA || i == 0xD { continue }
    delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
  }
  delimiterLexingDiagnosticTest("re'\n'", .endOfString)
  delimiterLexingDiagnosticTest("re'\r'", .endOfString)
  delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)

  // MARK: Delimiter skipping

  // Each of these opens a quoted name/reference inside the literal and closes
  // it, but never supplies the literal's own closing `'`.
  delimiterLexingDiagnosticTest("re'(?''", .endOfString)
  delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString)
  delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString)
  delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString)
  delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString)
}
19392031

19402032
func testlibswiftDiagnostics() {

0 commit comments

Comments
 (0)