Skip to content

Commit b1172a1

Browse files
committed
Add lexing heuristic to handle single quotes in re'...'
If a single quote is encountered with a prefix of either `(?`, `(?(`, `\k`, `\g` or `(?C`, continue to scan ahead to a closing `'`. Such prefixes would not be valid endings for a regex literal anyway, and this lets us handle the single quote variant of their syntax. For the group name cases, further refine this skipping behavior by only skipping over characters that could possibly appear in that case. This improves diagnostic behavior by ensuring we don't go wandering off into Swift code.
1 parent b52d992 commit b1172a1

File tree

2 files changed

+191
-7
lines changed

2 files changed

+191
-7
lines changed

Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ fileprivate struct DelimiterLexer {
104104
slice(at: cursor, count)
105105
}
106106

107+
/// Returns the `count` bytes that sit immediately before the cursor.
///
/// - Parameter count: The number of bytes to look back.
/// - Returns: The slice starting `count` bytes before the cursor, or `nil`
///   when that position would fall before `start`.
func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
  let sliceStart = cursor - count
  if sliceStart < start { return nil }
  return slice(at: sliceStart, count)
}
114+
107115
/// Advance the cursor `n` bytes.
108116
mutating func advanceCursor(_ n: Int = 1) {
109117
cursor += n
@@ -123,6 +131,86 @@ fileprivate struct DelimiterLexer {
123131
return true
124132
}
125133

134+
/// Attempt to skip over a closing delimiter character that is unlikely to be
/// the actual closing delimiter.
///
/// Only the closing `'` of `re'...'` is considered. When the cursor is on a
/// `'` directly preceded by an unescaped `(?`, `(?(`, `\k`, `\g` or `(?C`, this
/// scans ahead for the matching `'` and, if one is found, leaves the cursor
/// just past it. In every other case the cursor is left unchanged.
///
/// - Parameter delimiter: The delimiter of the literal currently being lexed.
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
  // Only the closing `'` for re'...' can potentially be skipped over.
  switch delimiter {
  case .traditional, .experimental:
    return
  case .reSingleQuote:
    break
  }
  guard load() == ascii("'") else { return }

  // Need to look for a prefix of `(?`, `(?(`, `\k`, `\g`, `(?C`, as those
  // are the cases that could use single quotes. Note that none of these
  // would be valid regex endings anyway.
  let calloutPrefix = "(?C"
  let candidates = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix]
  let prefix = candidates.first { candidate in
    guard let priorSlice = sliceBehind(candidate.utf8.count),
          priorSlice.elementsEqual(candidate.utf8)
    else { return false }

    // Make sure the prefix isn't itself escaped, as that invalidates this
    // analysis. A single look-behind byte isn't enough: in `\\(?'` the
    // backslash before `(` belongs to an escaped backslash, so `(` is not
    // escaped. Count the whole run of backslashes before the prefix; the
    // prefix is escaped only if that run has odd length.
    var backslashCount = 0
    while let behind = sliceBehind(priorSlice.count + backslashCount + 1),
          behind[0] == ascii("\\") {
      backslashCount += 1
    }
    return backslashCount % 2 == 0
  }
  guard let prefix = prefix else { return }
  let isCallout = prefix == calloutPrefix

  func isPossiblyGroupReference(_ c: UInt8) -> Bool {
    // If this is an ASCII character, make sure it's for a group name. Leave
    // other UTF-8 encoded scalars alone, this should at least catch cases
    // where we run into a symbol such as `{`, `.`, `;` that would indicate
    // we've likely advanced out of the bounds of the regex.
    let scalar = UnicodeScalar(c)
    guard scalar.isASCII else { return true }
    switch scalar {
    // Include '-' and '+' which may be used in recursion levels and relative
    // references.
    case "A"..."Z", "a"..."z", "0"..."9", "_", "-", "+":
      return true
    default:
      return false
    }
  }

  // Make a note of the current lexing position, as we may need to revert
  // back to it.
  let originalCursor = cursor
  advanceCursor()

  // Try to skip over what would be the contents of a group
  // identifier/reference.
  while let next = load() {
    // Found the ending, we're done. Return so we can continue to lex to the
    // real delimiter.
    if next == ascii("'") {
      advanceCursor()
      return
    }

    // If this isn't a callout, make sure we have something that could be a
    // group reference. We limit the character set here to improve diagnostic
    // behavior in the case where the literal is actually unterminated. We
    // ideally don't want to go wandering off into Swift source code. We can't
    // do the same for callouts, as they take arbitrary strings.
    guard isCallout || isPossiblyGroupReference(next) else { break }
    do {
      try advance()
    } catch {
      break
    }
  }
  // We bailed out, either because we ran into something that didn't look like
  // an identifier, because `advance()` failed, or because there was nothing
  // left to load. Revert back to the original guess of delimiter.
  cursor = originalCursor
}
213+
126214
/// Attempt to eat a particular closing delimiter, returning the contents of
127215
/// the literal, and ending pointer, or `nil` if this is not a delimiter
128216
/// ending.
@@ -194,6 +282,10 @@ fileprivate struct DelimiterLexer {
194282

195283
let contentsStart = cursor
196284
while true {
285+
// Check to see if we're at a character that looks like a delimiter, but
286+
// likely isn't. In such a case, we can attempt to skip over it.
287+
trySkipDelimiter(delimiter)
288+
197289
// Try to lex the closing delimiter.
198290
if let (contents, end) = try tryEatEnding(delimiter,
199291
contentsStart: contentsStart) {

Tests/RegexTests/ParseTests.swift

Lines changed: 99 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,28 +107,46 @@ func parseTest(
107107
serializedCaptures.deallocate()
108108
}
109109

110+
/// Test delimiter lexing. Takes an input string that starts with a regex
111+
/// literal. If `ignoreTrailing` is true, the input must have additional
112+
/// characters after the literal that are not considered part of it; otherwise
/// the literal must span the entire input.
///
/// - Returns: The portion of `input` from its start up to the point where
///   lexing ended, decoded as UTF-8.
113+
@discardableResult
110114
func delimiterLexingTest(
111-
_ input: String, file: StaticString = #file, line: UInt = #line
112-
) {
115+
_ input: String, ignoreTrailing: Bool = false,
116+
file: StaticString = #file, line: UInt = #line
117+
) -> String {
113118
input.withCString(encodedAs: UTF8.self) { ptr in
114119
let endPtr = ptr + input.utf8.count
115120
let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
116-
XCTAssertEqual(end, endPtr, file: file, line: line)
121+
// With `ignoreTrailing`, lexing has to stop before the end of the input;
// without it, lexing has to consume the input exactly.
if ignoreTrailing {
122+
XCTAssertNotEqual(end, endPtr, file: file, line: line)
123+
} else {
124+
XCTAssertEqual(end, endPtr, file: file, line: line)
125+
}
117126

118-
let (parseContents, parseDelim) = droppingRegexDelimiters(input)
127+
// Rebuild the literal from the bytes between the start of the input and the
// lexer's end pointer, so any trailing characters are left out.
let rawPtr = UnsafeRawPointer(ptr)
128+
let buffer = UnsafeRawBufferPointer(start: rawPtr, count: end - rawPtr)
129+
let literal = String(decoding: buffer, as: UTF8.self)
130+
131+
// The lexer's contents and delimiter should agree with what
// `droppingRegexDelimiters` yields for the same literal.
let (parseContents, parseDelim) = droppingRegexDelimiters(literal)
119132
XCTAssertEqual(contents, parseContents, file: file, line: line)
120133
XCTAssertEqual(delim, parseDelim, file: file, line: line)
134+
return literal
121135
}
122136
}
123137

138+
/// Test parsing an input string with regex delimiters. If `ignoreTrailing` is
139+
/// true, there must be additional characters that follow the literal that are
140+
/// not considered part of it.
124141
func parseWithDelimitersTest(
125-
_ input: String, _ expecting: AST.Node,
142+
_ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false,
126143
file: StaticString = #file, line: UInt = #line
127144
) {
128145
// First try lexing.
129-
delimiterLexingTest(input, file: file, line: line)
146+
let literal = delimiterLexingTest(
147+
input, ignoreTrailing: ignoreTrailing, file: file, line: line)
130148

131-
let orig = try! parseWithDelimiters(input)
149+
let orig = try! parseWithDelimiters(literal)
132150
let ast = orig.root
133151
guard ast == expecting
134152
|| ast._dump() == expecting._dump() // EQ workaround
@@ -1505,6 +1523,63 @@ extension RegexTests {
15051523

15061524
// Printable ASCII characters.
15071525
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
1526+
1527+
// MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
1528+
// if it's clear that it's part of the regex syntax.
1529+
1530+
parseWithDelimitersTest(
1531+
#"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'"))
1532+
parseWithDelimitersTest(
1533+
#"re'(?'a_bcA0-c1A'x*)'"#,
1534+
balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")))
1535+
1536+
parseWithDelimitersTest(
1537+
#"re'(?('a_bcA0')x|y)'"#, conditional(
1538+
.groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"))
1539+
parseWithDelimitersTest(
1540+
#"re'(?('+20')\')'"#, conditional(
1541+
.groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()))
1542+
1543+
parseWithDelimitersTest(
1544+
#"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))))
1545+
parseWithDelimitersTest(
1546+
#"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1))
1547+
1548+
parseWithDelimitersTest(
1549+
#"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))))
1550+
parseWithDelimitersTest(
1551+
#"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"))
1552+
1553+
parseWithDelimitersTest(
1554+
#"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)))
1555+
1556+
// Fine, because we don't end up skipping.
1557+
delimiterLexingTest(#"re'(?'"#)
1558+
delimiterLexingTest(#"re'(?('"#)
1559+
delimiterLexingTest(#"re'\k'"#)
1560+
delimiterLexingTest(#"re'\g'"#)
1561+
delimiterLexingTest(#"re'(?C'"#)
1562+
1563+
// Not a valid group name, but we can still skip over it.
1564+
delimiterLexingTest(#"re'(?'🔥')'"#)
1565+
1566+
// Escaped, so don't skip. These will ignore the ending `'` as we've already
1567+
// closed the literal.
1568+
parseWithDelimitersTest(
1569+
#"re'\(?''"#, zeroOrOne(of: "("), ignoreTrailing: true
1570+
)
1571+
parseWithDelimitersTest(
1572+
#"re'\\k''"#, concat("\\", "k"), ignoreTrailing: true
1573+
)
1574+
parseWithDelimitersTest(
1575+
#"re'\\g''"#, concat("\\", "g"), ignoreTrailing: true
1576+
)
1577+
parseWithDelimitersTest(
1578+
#"re'\(?C''"#, concat(zeroOrOne(of: "("), "C"), ignoreTrailing: true
1579+
)
1580+
delimiterLexingTest(#"re'(\?''"#, ignoreTrailing: true)
1581+
delimiterLexingTest(#"re'\(?(''"#, ignoreTrailing: true)
1582+
15081583
// MARK: Parse not-equal
15091584

15101585
// Make sure dumping output correctly reflects differences in AST.
@@ -1811,6 +1886,12 @@ extension RegexTests {
18111886
diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName))
18121887
diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName))
18131888

1889+
// TODO: It might be better if we tried to consume up to the closing `'` and
1890+
// diagnosed an invalid group name based on that.
1891+
diagnosticTest(#"(?'abc ')"#, .expected("'"))
1892+
1893+
diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName))
1894+
18141895
diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName))
18151896
diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName))
18161897
diagnosticTest(#"(?'a-b-c')"#, .expected("'"))
@@ -1924,13 +2005,24 @@ extension RegexTests {
19242005
}
19252006

19262007
/// Checks delimiter-lexing failures: each call hands an input and a
/// diagnostic (`.endOfString` or `.unprintableASCII`) to
/// `delimiterLexingDiagnosticTest`.
/// NOTE(review): presumably the diagnostic is the error lexing is expected to
/// produce; the helper is defined outside this view — confirm.
func testDelimiterLexingErrors() {
2008+
2009+
// MARK: Printable ASCII
2010+
19272011
delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
19282012
// Every control character in 0x01..<0x20 except the two line terminators,
// which are covered separately below with `.endOfString`.
for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
19292013
delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
19302014
}
19312015
delimiterLexingDiagnosticTest("re'\n'", .endOfString)
19322016
delimiterLexingDiagnosticTest("re'\r'", .endOfString)
19332017
delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
2018+
2019+
// MARK: Delimiter skipping
2020+
2021+
// In each input the `'` after the prefix opens a quoted span that the
// lexer's skipping heuristic treats as regex syntax, which leaves the
// literal without a closing delimiter.
delimiterLexingDiagnosticTest("re'(?''", .endOfString)
2022+
delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString)
2023+
delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString)
2024+
delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString)
2025+
delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString)
19342026
}
19352027

19362028
func testlibswiftDiagnostics() {

0 commit comments

Comments
 (0)