Skip to content

Commit d191d8e

Browse files
authored
Merge pull request #194 from hamishknight/quoted-in-context
2 parents 4dab8d8 + 2325cef commit d191d8e

File tree

7 files changed

+541
-161
lines changed

7 files changed

+541
-161
lines changed
Lines changed: 332 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
// TODO: mock up multi-line soon
13+
14+
/// The kinds of delimiter pair that may surround a regex literal.
///
/// The declaration order of the cases is the order in which `allCases`
/// enumerates them.
enum Delimiter: Hashable, CaseIterable {
  case traditional, experimental, reSingleQuote, rxSingleQuote

  /// The text that opens a literal using this delimiter.
  var opening: String {
    switch self {
    case .traditional:   return "#/"
    case .experimental:  return "#|"
    case .reSingleQuote: return "re'"
    case .rxSingleQuote: return "rx'"
    }
  }

  /// The text that closes a literal using this delimiter.
  var closing: String {
    switch self {
    case .traditional:
      return "/#"
    case .experimental:
      return "|#"
    case .reSingleQuote, .rxSingleQuote:
      // Both single-quoted forms end with a lone quote.
      return "'"
    }
  }

  /// The opening and closing text of the delimiter, as a labeled pair.
  var openingAndClosing: (opening: String, closing: String) {
    (opening: opening, closing: closing)
  }

  /// The syntax options that this delimiter selects by default.
  var defaultSyntaxOptions: SyntaxOptions {
    switch self {
    case .experimental, .rxSingleQuote:
      return .experimental
    case .traditional, .reSingleQuote:
      return .traditional
    }
  }
}
41+
42+
/// An error produced while lexing the delimiters of a regex literal.
struct DelimiterLexError: Error, CustomStringConvertible {
  /// The category of failure that was encountered.
  enum Kind: Hashable {
    case endOfString
    // TODO: better range reporting for invalid UTF-8.
    case invalidUTF8
    case unknownDelimiter
    case unprintableASCII
  }

  /// The category of this error.
  var kind: Kind

  /// The pointer at which to resume lexing.
  var resumePtr: UnsafeRawPointer

  /// Creates an error of the given kind, recording where lexing may resume.
  init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
    self.resumePtr = resumePtr
    self.kind = kind
  }

  /// A human-readable message describing the error kind.
  var description: String {
    let message: String
    switch kind {
    case .unprintableASCII:
      message = "unprintable ASCII character found in source file"
    case .unknownDelimiter:
      message = "unknown regex literal delimiter"
    case .invalidUTF8:
      message = "invalid UTF-8 found in source file"
    case .endOfString:
      message = "unterminated regex literal"
    }
    return message
  }
}
69+
70+
/// A byte-level lexer that locates the opening and closing delimiters of a
/// regex literal within the buffer bounded by `start` and `end`.
fileprivate struct DelimiterLexer {
  /// The first byte of the buffer.
  let start: UnsafeRawPointer

  /// The current lexing position.
  var cursor: UnsafeRawPointer

  /// The position one past the last byte of the buffer.
  let end: UnsafeRawPointer

  /// Creates a lexer positioned at `start`. `start` must not be after `end`.
  init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
    precondition(start <= end)
    self.start = start
    self.end = end
    self.cursor = start
  }

  /// Convert an ASCII scalar to its single-byte value.
  func ascii(_ scalar: Unicode.Scalar) -> UInt8 {
    assert(scalar.isASCII)
    return UInt8(asserting: scalar.value)
  }

  /// The byte under the cursor, or `nil` once the cursor has reached `end`.
  func load() -> UInt8? {
    cursor < end ? cursor.load(as: UInt8.self) : nil
  }

  /// The `count` bytes beginning at `position`, or `nil` if that range would
  /// extend beyond `end`.
  func slice(
    at position: UnsafeRawPointer, _ count: Int
  ) -> UnsafeRawBufferPointer? {
    if position + count > end { return nil }
    return UnsafeRawBufferPointer(start: position, count: count)
  }

  /// The `count` bytes beginning at the cursor, or `nil` if fewer than `count`
  /// bytes remain before `end`.
  func slice(_ count: Int) -> UnsafeRawBufferPointer? {
    slice(at: cursor, count)
  }

  /// The `count` bytes immediately before the cursor, or `nil` if fewer than
  /// `count` bytes lie between `start` and the cursor.
  func sliceBehind(_ count: Int) -> UnsafeRawBufferPointer? {
    guard cursor - start >= count else { return nil }
    return slice(at: cursor - count, count)
  }

  /// Move the cursor forward by `n` bytes, trapping if that passes `end`.
  mutating func advanceCursor(_ n: Int = 1) {
    cursor = cursor + n
    precondition(cursor <= end, "Cannot advance past end")
  }

  /// Whether the bytes at the cursor are exactly the given UTF-8 sequence.
  func canEat(_ utf8: String.UTF8View) -> Bool {
    slice(utf8.count)?.elementsEqual(utf8) ?? false
  }

  /// Consume the given UTF-8 sequence if it is present at the cursor,
  /// returning whether it was consumed.
  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
    if canEat(utf8) {
      advanceCursor(utf8.count)
      return true
    }
    return false
  }

  /// If the cursor sits on a `'` that appears to open a quoted group name,
  /// reference or callout rather than close the literal, move the cursor past
  /// the matching `'`. Otherwise the cursor is left where it was.
  mutating func trySkipDelimiter(_ delimiter: Delimiter) {
    // Only the single-quoted delimiters have a closing character that can
    // also appear inside the regex.
    switch delimiter {
    case .reSingleQuote, .rxSingleQuote:
      break
    case .traditional, .experimental:
      return
    }
    guard load() == ascii("'") else { return }

    // The constructs that may be followed by a single-quoted name are `(?`,
    // `(?(`, `\k`, `\g` and `(?C`. None of them is a valid way to end a
    // regex, so a quote directly after one is assumed not to be the closing
    // delimiter.
    let calloutPrefix = "(?C"
    let candidates = ["(?", "(?(", #"\k"#, #"\g"#, calloutPrefix]
    let matched = candidates.first { candidate in
      let length = candidate.utf8.count
      guard let behind = sliceBehind(length),
            behind.elementsEqual(candidate.utf8)
      else { return false }

      // A `\` immediately before the candidate invalidates this analysis.
      if let widened = sliceBehind(length + 1) {
        return widened[0] != ascii("\\")
      }
      return true
    }
    guard let matched = matched else { return }
    let isCallout = matched == calloutPrefix

    // Whether a byte could plausibly be part of a group name or reference.
    // Non-ASCII bytes are always accepted; ASCII bytes must be alphanumeric,
    // `_`, or the `-`/`+` used by recursion levels and relative references.
    // Symbols such as `{`, `.` or `;` suggest we have left the regex.
    func isPossiblyGroupReference(_ byte: UInt8) -> Bool {
      let scalar = UnicodeScalar(byte)
      if !scalar.isASCII { return true }
      switch scalar {
      case "a"..."z", "A"..."Z", "0"..."9":
        return true
      case "_", "-", "+":
        return true
      default:
        return false
      }
    }

    // Remember where we started so the skip can be undone.
    let savedCursor = cursor
    advanceCursor()

    // Walk over what would be the quoted name.
    scan: while let byte = load() {
      // Reached the matching quote: step past it and let the caller carry on
      // lexing towards the real closing delimiter.
      if byte == ascii("'") {
        advanceCursor()
        return
      }

      // Outside of callouts, restrict the accepted characters so that an
      // unterminated literal doesn't send us wandering through Swift source.
      // Callouts take arbitrary strings, so they cannot be restricted.
      if !isCallout && !isPossiblyGroupReference(byte) { break scan }

      do {
        try advance()
      } catch {
        break scan
      }
    }

    // Either something didn't look like an identifier or we hit the end of
    // the line; fall back to treating the original quote as the delimiter.
    cursor = savedCursor
  }

  /// Consume `delimiter`'s closing text if it is at the cursor.
  ///
  /// - Returns: The literal's contents (the bytes from `contentsStart` up to
  ///   the closing text) together with the cursor position after the closing
  ///   text, or `nil` if the closing text is not at the cursor.
  /// - Throws: `DelimiterLexError` of kind `.invalidUTF8` if the contents do
  ///   not round-trip through UTF-8 decoding.
  mutating func tryEatEnding(
    _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
  ) throws -> (contents: String, end: UnsafeRawPointer)? {
    let closingStart = cursor
    if !tryEat(delimiter.closing.utf8) { return nil }

    // Decode the contents, then compare against the raw bytes to detect any
    // repair performed by the decoder, which indicates invalid UTF-8.
    let rawContents = UnsafeRawBufferPointer(
      start: contentsStart, count: closingStart - contentsStart)
    let decoded = String(decoding: rawContents, as: UTF8.self)

    if decoded.utf8.elementsEqual(rawContents) {
      return (contents: decoded, end: cursor)
    }
    throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
  }

  /// Step the lexer over the next byte (or over a backslash escape).
  ///
  /// - Parameter escaped: Whether the byte at the cursor directly follows an
  ///   unescaped backslash.
  /// - Throws: `DelimiterLexError` when the buffer or line ends, or when an
  ///   unprintable ASCII byte is found.
  mutating func advance(escaped: Bool = false) throws {
    guard let byte = load() else {
      throw DelimiterLexError(.endOfString, resumeAt: cursor)
    }
    let scalar = UnicodeScalar(byte)

    // Bytes of a multi-byte UTF-8 sequence are stepped over one at a time.
    // Only ASCII is ever matched against, and the contents are validated as
    // UTF-8 once the closing delimiter is found.
    if !scalar.isASCII {
      advanceCursor()
      return
    }

    // A line ending terminates the literal, even after a backslash.
    if scalar == "\n" || scalar == "\r" {
      throw DelimiterLexError(.endOfString, resumeAt: cursor)
    }

    // TODO: Warn to match the behavior of String literal lexer? Or should
    // we error as unprintable?
    if scalar == "\0" {
      advanceCursor()
      return
    }

    // An unescaped backslash consumes the following character as well.
    if scalar == "\\" && !escaped {
      advanceCursor()
      try advance(escaped: true)
      return
    }

    // Diagnose unprintable ASCII.
    // TODO: Ideally we would recover and continue to lex until the ending
    // delimiter.
    if !scalar.isPrintableASCII {
      throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
    }

    advanceCursor()
  }

  /// Lex a complete literal starting at the cursor.
  ///
  /// - Returns: The contents between the delimiters, the delimiter that was
  ///   matched, and the position immediately after the closing delimiter.
  /// - Throws: `DelimiterLexError` if no known opening delimiter is at the
  ///   cursor, or if lexing fails before a closing delimiter is found.
  /*consuming*/ mutating func lex(
  ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {

    // Find the first delimiter whose opening text can be consumed.
    var opened: Delimiter?
    for candidate in Delimiter.allCases {
      if tryEat(candidate.opening.utf8) {
        opened = candidate
        break
      }
    }
    guard let delimiter = opened else {
      throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
    }

    let contentsStart = cursor
    while true {
      // Step over a character that looks like the closing delimiter but
      // probably isn't.
      trySkipDelimiter(delimiter)

      // Finish if the closing delimiter is here.
      if let ending = try tryEatEnding(delimiter,
                                       contentsStart: contentsStart) {
        return (ending.contents, delimiter, ending.end)
      }

      // Otherwise move on to the next character.
      try advance()
    }
  }
}
301+
302+
/// Strip the regex delimiters from `str`, returning the remaining contents
/// along with the delimiter that was recognized.
///
/// The opening delimiter is required; the closing delimiter is removed only if
/// present. Traps if `str` does not begin with any known opening delimiter.
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
  // The contents of `str` with `delimiter` removed, or `nil` if `str` does not
  // start with that delimiter's opening text.
  func contents(strippedOf delimiter: Delimiter) -> String? {
    guard let afterOpening = str.utf8.tryDropPrefix(delimiter.opening.utf8)
    else { return nil }

    // The closing delimiter may be absent in invalid code, so dropping it is
    // optional.
    let body = afterOpening.tryDropSuffix(delimiter.closing.utf8)
      ?? afterOpening
    return String(body)
  }

  for delimiter in Delimiter.allCases {
    guard let stripped = contents(strippedOf: delimiter) else { continue }
    return (stripped, delimiter)
  }
  fatalError("No valid delimiters")
}
324+
325+
/// Lex a regex literal from the bytes between `start` and `end`.
///
/// - Returns: The literal's contents, the delimiter that was matched, and the
///   pointer from which to resume lexing.
/// - Throws: Any error thrown by the delimiter lexer.
func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
  var delimiterLexer = DelimiterLexer(start: start, end: end)
  let lexed = try delimiterLexer.lex()
  return lexed
}

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1472,7 +1472,9 @@ extension Source {
14721472
return ref
14731473
}
14741474

1475-
let char = src.eat()
1475+
guard let char = src.tryEat() else {
1476+
throw ParseError.expectedEscape
1477+
}
14761478

14771479
// Single-character builtins.
14781480
if let builtin = AST.Atom.EscapedBuiltin(

0 commit comments

Comments
 (0)