@@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible {
63
63
}
64
64
}
65
65
66
- /// Attempt to lex a regex literal between `start` and `end`, returning either
67
- /// the contents and pointer from which to resume lexing, or an error.
68
- func lexRegex(
69
- start: UnsafeRawPointer , end: UnsafeRawPointer
70
- ) throws -> ( contents: String , Delimiter , end: UnsafeRawPointer ) {
71
- precondition ( start <= end)
72
- var current = start
66
/// A cursor-based lexer over a raw byte buffer that recognizes a delimited
/// regex literal and extracts its contents.
fileprivate struct DelimiterLexer {
  /// The first byte of the buffer being lexed.
  let start: UnsafeRawPointer

  /// The position of the next byte to examine.
  var cursor: UnsafeRawPointer

  /// The end of the buffer. Bytes are only ever read while `cursor < end`.
  let end: UnsafeRawPointer

  /// Create a lexer positioned at `start`.
  ///
  /// - Precondition: `start <= end`.
  init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
    precondition(start <= end)
    self.start = start
    self.cursor = start
    self.end = end
  }

  /// Return the byte value of an ASCII scalar.
  func ascii(_ s: Unicode.Scalar) -> UInt8 {
    assert(s.value <= 0x7F)
    return UInt8(asserting: s.value)
  }

  /// Return the byte at the current cursor, or `nil` if the end of the buffer
  /// has been reached.
  func load() -> UInt8? {
    guard cursor < end else { return nil }
    return cursor.load(as: UInt8.self)
  }

  /// Return the slice of `count` bytes from a specified cursor position, or
  /// `nil` if there are fewer than `count` bytes until the end of the buffer.
  func slice(
    at position: UnsafeRawPointer, _ count: Int
  ) -> UnsafeRawBufferPointer? {
    // Compare against the remaining distance rather than forming
    // `position + count`, so no out-of-bounds pointer is ever computed.
    guard count <= end - position else { return nil }
    return UnsafeRawBufferPointer(start: position, count: count)
  }

  /// Return the slice of `count` bytes from the current cursor, or `nil` if
  /// there are fewer than `count` bytes until the end of the buffer.
  func slice(_ count: Int) -> UnsafeRawBufferPointer? {
    slice(at: cursor, count)
  }

  /// Advance the cursor `n` bytes.
  ///
  /// - Precondition: The resulting cursor must not be past `end`.
  mutating func advanceCursor(_ n: Int = 1) {
    // Validate before mutating so the cursor never holds an out-of-bounds
    // position, even momentarily.
    precondition(n <= end - cursor, "Cannot advance past end")
    cursor += n
  }

  /// Check to see if a UTF-8 sequence can be eaten from the current cursor.
  func canEat(_ utf8: String.UTF8View) -> Bool {
    guard let bytes = slice(utf8.count) else { return false }
    return bytes.elementsEqual(utf8)
  }

  /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
  /// The cursor is only moved when the whole sequence matches.
  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
    guard canEat(utf8) else { return false }
    advanceCursor(utf8.count)
    return true
  }

  /// Attempt to eat a particular closing delimiter, returning the contents of
  /// the literal, and ending pointer, or `nil` if this is not a delimiter
  /// ending.
  ///
  /// - Parameters:
  ///   - delimiter: The delimiter whose closing sequence is matched at the
  ///     cursor.
  ///   - contentsStart: The position just after the opening delimiter.
  /// - Throws: A `DelimiterLexError` of kind `.invalidUTF8` if the bytes
  ///   between `contentsStart` and the closing delimiter are not valid UTF-8.
  ///   The cursor has already been moved past the closing delimiter by then.
  mutating func tryEatEnding(
    _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
  ) throws -> (contents: String, end: UnsafeRawPointer)? {
    let contentsEnd = cursor
    guard tryEat(delimiter.closing.utf8) else { return nil }

    // Form a string from the contents and make sure it's valid UTF-8.
    // `String(decoding:as:)` repairs ill-formed input, so a round-trip
    // mismatch means the original bytes were invalid.
    let count = contentsEnd - contentsStart
    let contents = UnsafeRawBufferPointer(start: contentsStart, count: count)
    let s = String(decoding: contents, as: UTF8.self)

    guard s.utf8.elementsEqual(contents) else {
      throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
    }
    return (contents: s, end: cursor)
  }

  /// Attempt to advance the lexer, throwing an error if the end of a line or
  /// the end of the buffer is reached.
  ///
  /// - Parameter escaped: `true` when the byte at the cursor directly follows
  ///   an unescaped backslash, in which case a backslash at the cursor is
  ///   consumed as an ordinary byte instead of starting another escape.
  /// - Throws: A `DelimiterLexError` of kind `.endOfString` when the cursor is
  ///   at the end of the buffer or at a `\n` or `\r` byte.
  mutating func advance(escaped: Bool = false) throws {
    guard let next = load() else {
      throw DelimiterLexError(.endOfString, resumeAt: cursor)
    }
    switch UnicodeScalar(next) {
    case let next where !next.isASCII:
      // Just advance into a UTF-8 sequence. It shouldn't matter that we'll
      // iterate through each byte as we only match against ASCII, and we
      // validate it at the end. This case is separated out so we can just deal
      // with the ASCII cases below.
      advanceCursor()

    case "\n", "\r":
      throw DelimiterLexError(.endOfString, resumeAt: cursor)

    case "\0":
      // TODO: Warn to match the behavior of String literal lexer? Or should
      // we error as unprintable?
      advanceCursor()

    case "\\" where !escaped:
      // Advance again for an escape sequence.
      advanceCursor()
      try advance(escaped: true)

    default:
      advanceCursor()
    }
  }

  /// Lex a complete literal: an opening delimiter, the contents, and the
  /// matching closing delimiter.
  ///
  /// - Returns: The contents of the literal, the delimiter that was matched,
  ///   and the pointer just past the closing delimiter.
  /// - Throws: A `DelimiterLexError` of kind `.unknownDelimiter` if no opening
  ///   delimiter matches at the cursor, plus any error thrown by
  ///   `tryEatEnding` or `advance`.
  /*consuming*/ mutating func lex(
  ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {

    // Try to lex the opening delimiter.
    guard let delimiter = Delimiter.allCases.first(
      where: { tryEat($0.opening.utf8) }
    ) else {
      // Resume one byte further on, but never past the end of the buffer
      // (which would otherwise happen for an empty buffer).
      let resumePoint = cursor < end ? cursor.successor() : end
      throw DelimiterLexError(.unknownDelimiter, resumeAt: resumePoint)
    }

    let contentsStart = cursor
    while true {
      // Try to lex the closing delimiter.
      if let ending = try tryEatEnding(
        delimiter, contentsStart: contentsStart
      ) {
        return (ending.contents, delimiter, ending.end)
      }
      // Try to advance the lexer.
      try advance()
    }
  }
}
@@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
151
217
}
152
218
fatalError ( " No valid delimiters " )
153
219
}
220
+
221
/// Lex a regex literal from the bytes between `start` and `end`.
///
/// - Parameters:
///   - start: The position at which lexing begins.
///   - end: The end of the buffer.
/// - Returns: The contents of the literal, its `Delimiter`, and the pointer
///   from which to resume lexing.
/// - Throws: Any error thrown while lexing the literal.
func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
  var delimiterLexer = DelimiterLexer(start: start, end: end)
  let lexed = try delimiterLexer.lex()
  return lexed
}
0 commit comments