Skip to content

Commit b25afb7

Browse files
committed
Refactor delimiter lexing logic
Introduce a DelimiterLexer type to perform the lexing.
1 parent 2047cb0 commit b25afb7

File tree

1 file changed

+121
-46
lines changed

1 file changed

+121
-46
lines changed

Sources/_MatchingEngine/Regex/Parse/DelimiterLexing.swift

Lines changed: 121 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible {
6363
}
6464
}
6565

66-
/// Attempt to lex a regex literal between `start` and `end`, returning either
67-
/// the contents and pointer from which to resume lexing, or an error.
68-
func lexRegex(
69-
start: UnsafeRawPointer, end: UnsafeRawPointer
70-
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
71-
precondition(start <= end)
72-
var current = start
66+
fileprivate struct DelimiterLexer {
67+
let start: UnsafeRawPointer
68+
var cursor: UnsafeRawPointer
69+
let end: UnsafeRawPointer
70+
71+
init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
72+
precondition(start <= end)
73+
self.start = start
74+
self.cursor = start
75+
self.end = end
76+
}
7377

7478
func ascii(_ s: Unicode.Scalar) -> UInt8 {
7579
assert(s.value <= 0x7F)
7680
return UInt8(asserting: s.value)
7781
}
78-
func load(offset: Int) -> UInt8? {
79-
guard current + offset < end else { return nil }
80-
return current.load(fromByteOffset: offset, as: UInt8.self)
82+
83+
/// Return the byte at the current cursor, or `nil` if the end of the buffer
84+
/// has been reached.
85+
func load() -> UInt8? {
86+
guard cursor < end else { return nil }
87+
return cursor.load(as: UInt8.self)
8188
}
82-
func load() -> UInt8? { load(offset: 0) }
83-
func advance(_ n: Int = 1) {
84-
precondition(current + n <= end, "Cannot advance past end")
85-
current = current.advanced(by: n)
89+
90+
/// Return the slice of `count` bytes from a specified cursor position, or
91+
/// `nil` if there are fewer than `count` bytes until the end of the buffer.
92+
func slice(
93+
at cursor: UnsafeRawPointer, _ count: Int
94+
) -> UnsafeRawBufferPointer? {
95+
guard cursor + count <= end else { return nil }
96+
return UnsafeRawBufferPointer(start: cursor, count: count)
8697
}
8798

88-
func tryEat(_ utf8: String.UTF8View) -> Bool {
89-
for (i, idx) in utf8.indices.enumerated() {
90-
guard load(offset: i) == utf8[idx] else { return false }
91-
}
92-
advance(utf8.count)
99+
/// Return the slice of `count` bytes from the current cursor, or `nil` if
100+
/// there are fewer than `count` bytes until the end of the buffer.
101+
func slice(_ count: Int) -> UnsafeRawBufferPointer? {
102+
slice(at: cursor, count)
103+
}
104+
105+
/// Advance the cursor `n` bytes.
106+
mutating func advanceCursor(_ n: Int = 1) {
107+
cursor += n
108+
precondition(cursor <= end, "Cannot advance past end")
109+
}
110+
111+
/// Check to see if a UTF-8 sequence can be eaten from the current cursor.
112+
func canEat(_ utf8: String.UTF8View) -> Bool {
113+
guard let slice = slice(utf8.count) else { return false }
114+
return slice.elementsEqual(utf8)
115+
}
116+
117+
/// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
118+
mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
119+
guard canEat(utf8) else { return false }
120+
advanceCursor(utf8.count)
93121
return true
94122
}
95123

96-
// Try to lex the opening delimiter.
97-
guard let delimiter = Delimiter.allCases.first(
98-
where: { tryEat($0.opening.utf8) }
99-
) else {
100-
throw DelimiterLexError(.unknownDelimiter, resumeAt: current.successor())
124+
/// Attempt to eat a particular closing delimiter, returning the contents of
125+
the literal and the ending pointer, or `nil` if this is not a delimiter
126+
/// ending.
127+
mutating func tryEatEnding(
128+
_ delimiter: Delimiter, contentsStart: UnsafeRawPointer
129+
) throws -> (contents: String, end: UnsafeRawPointer)? {
130+
let contentsEnd = cursor
131+
guard tryEat(delimiter.closing.utf8) else { return nil }
132+
133+
// Form a string from the contents and make sure it's valid UTF-8.
134+
let count = contentsEnd - contentsStart
135+
let contents = UnsafeRawBufferPointer(
136+
start: contentsStart, count: count)
137+
let s = String(decoding: contents, as: UTF8.self)
138+
139+
guard s.utf8.elementsEqual(contents) else {
140+
throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
141+
}
142+
return (contents: s, end: cursor)
101143
}
102144

103-
let contentsStart = current
104-
while true {
105-
switch load() {
106-
case nil, ascii("\n"), ascii("\r"):
107-
throw DelimiterLexError(.endOfString, resumeAt: current)
145+
/// Attempt to advance the lexer, throwing an error if the end of a line or
146+
/// the end of the buffer is reached.
147+
mutating func advance(escaped: Bool = false) throws {
148+
guard let next = load() else {
149+
throw DelimiterLexError(.endOfString, resumeAt: cursor)
150+
}
151+
switch UnicodeScalar(next) {
152+
case let next where !next.isASCII:
153+
// Just advance into a UTF-8 sequence. It shouldn't matter that we'll
154+
// iterate through each byte as we only match against ASCII, and we
155+
// validate it at the end. This case is separated out so we can just deal
156+
// with the ASCII cases below.
157+
advanceCursor()
158+
159+
case "\n", "\r":
160+
throw DelimiterLexError(.endOfString, resumeAt: cursor)
161+
162+
case "\0":
163+
// TODO: Warn to match the behavior of String literal lexer? Or should
164+
// we error as unprintable?
165+
advanceCursor()
166+
167+
case "\\" where !escaped:
168+
// Advance again for an escape sequence.
169+
advanceCursor()
170+
try advance(escaped: true)
108171

109-
case ascii("\\"):
110-
// Skip next byte.
111-
advance(2)
112172

113173
default:
114-
// Try to lex the closing delimiter.
115-
let contentsEnd = current
116-
guard tryEat(delimiter.closing.utf8) else {
117-
advance()
118-
continue
119-
}
174+
advanceCursor()
175+
}
176+
}
120177

121-
// Form a string from the contents and make sure it's valid UTF-8.
122-
let count = contentsEnd - contentsStart
123-
let contents = UnsafeRawBufferPointer(
124-
start: contentsStart, count: count)
125-
let s = String(decoding: contents, as: UTF8.self)
178+
/*consuming*/ mutating func lex(
179+
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
180+
181+
// Try to lex the opening delimiter.
182+
guard let delimiter = Delimiter.allCases.first(
183+
where: { tryEat($0.opening.utf8) }
184+
) else {
185+
throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
186+
}
126187

127-
guard s.utf8.elementsEqual(contents) else {
128-
throw DelimiterLexError(.invalidUTF8, resumeAt: current)
188+
let contentsStart = cursor
189+
while true {
190+
// Try to lex the closing delimiter.
191+
if let (contents, end) = try tryEatEnding(delimiter,
192+
contentsStart: contentsStart) {
193+
return (contents, delimiter, end)
129194
}
130-
return (contents: s, delimiter, end: current)
195+
// Try to advance the lexer.
196+
try advance()
131197
}
132198
}
133199
}
@@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
151217
}
152218
fatalError("No valid delimiters")
153219
}
220+
221+
/// Attempt to lex a regex literal between `start` and `end`, returning either
222+
/// the contents and pointer from which to resume lexing, or an error.
223+
func lexRegex(
224+
start: UnsafeRawPointer, end: UnsafeRawPointer
225+
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
226+
var lexer = DelimiterLexer(start: start, end: end)
227+
return try lexer.lex()
228+
}

0 commit comments

Comments
 (0)