Skip to content

Commit 7bdc37d

Browse files
committed
Separate out DelimiterLexing.swift
1 parent 2583463 commit 7bdc37d

File tree

2 files changed

+153
-144
lines changed

2 files changed

+153
-144
lines changed
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
// TODO: mock up multi-line soon
13+
14+
/// The delimiter pairs that can enclose a regex literal.
enum Delimiter: Hashable, CaseIterable {
  case traditional
  case experimental
  case reSingleQuote

  /// The opening and closing spellings of this delimiter.
  var openingAndClosing: (opening: String, closing: String) {
    let pair: (opening: String, closing: String)
    switch self {
    case .traditional:
      pair = (opening: "#/", closing: "/#")
    case .experimental:
      pair = (opening: "#|", closing: "|#")
    case .reSingleQuote:
      pair = (opening: "re'", closing: "'")
    }
    return pair
  }

  /// The text that begins a literal written with this delimiter.
  var opening: String {
    let (open, _) = openingAndClosing
    return open
  }

  /// The text that ends a literal written with this delimiter.
  var closing: String {
    let (_, close) = openingAndClosing
    return close
  }

  /// The default set of syntax options that the delimiter indicates.
  var defaultSyntaxOptions: SyntaxOptions {
    switch self {
    case .experimental:
      return .experimental
    case .traditional, .reSingleQuote:
      return .traditional
    }
  }
}
39+
40+
/// An error raised while lexing a regex literal.
struct LexError: Error, CustomStringConvertible {
  /// The category of failure that was encountered.
  enum Kind: Hashable {
    case endOfString
    case invalidUTF8 // TODO: better range reporting
    case unknownDelimiter
  }

  /// What went wrong.
  var kind: Kind

  /// The pointer at which to resume lexing.
  var resumePtr: UnsafeRawPointer

  /// Creates an error of the given kind, recording where lexing may resume.
  init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
    self.resumePtr = resumePtr
    self.kind = kind
  }

  /// A human-readable message describing `kind`.
  var description: String {
    let message: String
    switch kind {
    case .unknownDelimiter:
      message = "unknown regex literal delimiter"
    case .invalidUTF8:
      message = "invalid UTF-8 found in source file"
    case .endOfString:
      message = "unterminated regex literal"
    }
    return message
  }
}
65+
66+
/// Attempt to lex a regex literal between `start` and `end`, returning either
/// the contents and pointer from which to resume lexing, or an error.
///
/// - Parameters:
///   - start: Pointer to the first byte to examine. Lexing succeeds only if
///     the bytes here spell one of the `Delimiter` opening sequences.
///   - end: Pointer one past the last readable byte. Must not precede `start`.
/// - Returns: The text between the delimiters, the delimiter that was matched,
///   and the pointer just past the closing delimiter.
/// - Throws: A `LexError` whose kind is `.unknownDelimiter` when no opening
///   delimiter matches, `.endOfString` when the input ends or an unescaped
///   line break is reached before the closing delimiter, or `.invalidUTF8`
///   when the contents are not valid UTF-8.
func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
  precondition(start <= end)
  var current = start

  func ascii(_ s: Unicode.Scalar) -> UInt8 {
    assert(s.value <= 0x7F)
    return UInt8(asserting: s.value)
  }
  // Reads the byte `offset` bytes ahead of `current`, or nil at/after `end`.
  func load(offset: Int) -> UInt8? {
    guard current + offset < end else { return nil }
    return current.load(fromByteOffset: offset, as: UInt8.self)
  }
  func load() -> UInt8? { load(offset: 0) }
  func advance(_ n: Int = 1) {
    precondition(current + n <= end, "Cannot advance past end")
    current = current.advanced(by: n)
  }

  // Consumes `utf8` if the upcoming bytes match it exactly; otherwise leaves
  // `current` where it was.
  func tryEat(_ utf8: String.UTF8View) -> Bool {
    for (i, byte) in utf8.enumerated() {
      guard load(offset: i) == byte else { return false }
    }
    advance(utf8.count)
    return true
  }

  // Try to lex the opening delimiter.
  guard let delimiter = Delimiter.allCases.first(
    where: { tryEat($0.opening.utf8) }
  ) else {
    // Resume one byte further on so the caller makes progress, but never hand
    // back a pointer beyond `end` (which would happen for empty input).
    let resumePtr = current < end ? current.successor() : end
    throw LexError(.unknownDelimiter, resumeAt: resumePtr)
  }

  let contentsStart = current
  while true {
    switch load() {
    case nil, ascii("\n"), ascii("\r"):
      throw LexError(.endOfString, resumeAt: current)

    case ascii("\\"):
      // Skip the escaped byte. A backslash that is the very last byte has
      // nothing to escape: report an unterminated literal instead of tripping
      // the bounds precondition in `advance`.
      guard load(offset: 1) != nil else {
        advance()
        throw LexError(.endOfString, resumeAt: current)
      }
      advance(2)

    default:
      // Try to lex the closing delimiter.
      let contentsEnd = current
      guard tryEat(delimiter.closing.utf8) else {
        advance()
        continue
      }

      // Form a string from the contents and make sure it's valid UTF-8.
      // `String(decoding:as:)` repairs invalid sequences, so a round-trip
      // mismatch means the source bytes were not valid UTF-8.
      let count = contentsEnd - contentsStart
      let contents = UnsafeRawBufferPointer(
        start: contentsStart, count: count)
      let s = String(decoding: contents, as: UTF8.self)

      guard s.utf8.elementsEqual(contents) else {
        throw LexError(.invalidUTF8, resumeAt: current)
      }
      return (contents: s, delimiter, end: current)
    }
  }
}
134+
135+
/// Drop a set of regex delimiters from the input string, returning the contents
/// and the delimiter used. The input string must have valid delimiters.
///
/// Delimiters are tried in `Delimiter.allCases` order and the first one whose
/// opening and closing spellings both match is used. If none matches, this
/// function traps with `fatalError`.
///
/// - Parameter str: A complete regex literal, including its delimiters.
/// - Returns: The text between the delimiters and the delimiter that matched.
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
  let utf8 = str.utf8
  func stripDelimiter(_ delim: Delimiter) -> String? {
    let prefix = delim.opening.utf8
    let suffix = delim.closing.utf8
    // The opening and closing delimiters must not share bytes. Without the
    // length check, an input such as "#/#" or "re'" would satisfy both
    // comparisons and be treated as a literal with empty contents.
    guard utf8.count >= prefix.count + suffix.count,
          utf8.prefix(prefix.count).elementsEqual(prefix),
          utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil }

    return String(utf8.dropFirst(prefix.count).dropLast(suffix.count))
  }
  for d in Delimiter.allCases {
    if let contents = stripDelimiter(d) {
      return (contents, d)
    }
  }
  fatalError("No valid delimiters")
}

Sources/_MatchingEngine/Regex/Parse/Mocking.swift

Lines changed: 0 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -9,150 +9,6 @@
99
//
1010
//===----------------------------------------------------------------------===//
1111

12-
13-
// TODO: mock up multi-line soon
14-
15-
/// The delimiter pairs that can enclose a regex literal.
enum Delimiter: Hashable, CaseIterable {
  case traditional
  case experimental
  case reSingleQuote

  /// The opening and closing spellings of this delimiter.
  var openingAndClosing: (opening: String, closing: String) {
    let pair: (opening: String, closing: String)
    switch self {
    case .traditional:
      pair = (opening: "#/", closing: "/#")
    case .experimental:
      pair = (opening: "#|", closing: "|#")
    case .reSingleQuote:
      pair = (opening: "re'", closing: "'")
    }
    return pair
  }

  /// The text that begins a literal written with this delimiter.
  var opening: String {
    let (open, _) = openingAndClosing
    return open
  }

  /// The text that ends a literal written with this delimiter.
  var closing: String {
    let (_, close) = openingAndClosing
    return close
  }

  /// The default set of syntax options that the delimiter indicates.
  var defaultSyntaxOptions: SyntaxOptions {
    switch self {
    case .experimental:
      return .experimental
    case .traditional, .reSingleQuote:
      return .traditional
    }
  }
}
40-
41-
/// An error raised while lexing a regex literal.
struct LexError: Error, CustomStringConvertible {
  /// The category of failure that was encountered.
  enum Kind: Hashable {
    case endOfString
    case invalidUTF8 // TODO: better range reporting
    case unknownDelimiter
  }

  /// What went wrong.
  var kind: Kind

  /// The pointer at which to resume lexing.
  var resumePtr: UnsafeRawPointer

  /// Creates an error of the given kind, recording where lexing may resume.
  init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) {
    self.resumePtr = resumePtr
    self.kind = kind
  }

  /// A human-readable message describing `kind`.
  var description: String {
    let message: String
    switch kind {
    case .unknownDelimiter:
      message = "unknown regex literal delimiter"
    case .invalidUTF8:
      message = "invalid UTF-8 found in source file"
    case .endOfString:
      message = "unterminated regex literal"
    }
    return message
  }
}
66-
67-
/// Drop a set of regex delimiters from the input string, returning the contents
/// and the delimiter used. The input string must have valid delimiters.
///
/// Delimiters are tried in `Delimiter.allCases` order and the first one whose
/// opening and closing spellings both match is used. If none matches, this
/// function traps with `fatalError`.
///
/// - Parameter str: A complete regex literal, including its delimiters.
/// - Returns: The text between the delimiters and the delimiter that matched.
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
  let utf8 = str.utf8
  func stripDelimiter(_ delim: Delimiter) -> String? {
    let prefix = delim.opening.utf8
    let suffix = delim.closing.utf8
    // The opening and closing delimiters must not share bytes. Without the
    // length check, an input such as "#/#" or "re'" would satisfy both
    // comparisons and be treated as a literal with empty contents.
    guard utf8.count >= prefix.count + suffix.count,
          utf8.prefix(prefix.count).elementsEqual(prefix),
          utf8.suffix(suffix.count).elementsEqual(suffix) else { return nil }

    return String(utf8.dropFirst(prefix.count).dropLast(suffix.count))
  }
  for d in Delimiter.allCases {
    if let contents = stripDelimiter(d) {
      return (contents, d)
    }
  }
  fatalError("No valid delimiters")
}
86-
87-
/// Attempt to lex a regex literal between `start` and `end`, returning either
/// the contents and pointer from which to resume lexing, or an error.
///
/// - Parameters:
///   - start: Pointer to the first byte to examine. Lexing succeeds only if
///     the bytes here spell one of the `Delimiter` opening sequences.
///   - end: Pointer one past the last readable byte. Must not precede `start`.
/// - Returns: The text between the delimiters, the delimiter that was matched,
///   and the pointer just past the closing delimiter.
/// - Throws: A `LexError` whose kind is `.unknownDelimiter` when no opening
///   delimiter matches, `.endOfString` when the input ends or an unescaped
///   line break is reached before the closing delimiter, or `.invalidUTF8`
///   when the contents are not valid UTF-8.
func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
  precondition(start <= end)
  var current = start

  func ascii(_ s: Unicode.Scalar) -> UInt8 {
    assert(s.value <= 0x7F)
    return UInt8(asserting: s.value)
  }
  // Reads the byte `offset` bytes ahead of `current`, or nil at/after `end`.
  func load(offset: Int) -> UInt8? {
    guard current + offset < end else { return nil }
    return current.load(fromByteOffset: offset, as: UInt8.self)
  }
  func load() -> UInt8? { load(offset: 0) }
  func advance(_ n: Int = 1) {
    precondition(current + n <= end, "Cannot advance past end")
    current = current.advanced(by: n)
  }

  // Consumes `utf8` if the upcoming bytes match it exactly; otherwise leaves
  // `current` where it was.
  func tryEat(_ utf8: String.UTF8View) -> Bool {
    for (i, byte) in utf8.enumerated() {
      guard load(offset: i) == byte else { return false }
    }
    advance(utf8.count)
    return true
  }

  // Try to lex the opening delimiter.
  guard let delimiter = Delimiter.allCases.first(
    where: { tryEat($0.opening.utf8) }
  ) else {
    // Resume one byte further on so the caller makes progress, but never hand
    // back a pointer beyond `end` (which would happen for empty input).
    let resumePtr = current < end ? current.successor() : end
    throw LexError(.unknownDelimiter, resumeAt: resumePtr)
  }

  let contentsStart = current
  while true {
    switch load() {
    case nil, ascii("\n"), ascii("\r"):
      throw LexError(.endOfString, resumeAt: current)

    case ascii("\\"):
      // Skip the escaped byte. A backslash that is the very last byte has
      // nothing to escape: report an unterminated literal instead of tripping
      // the bounds precondition in `advance`.
      guard load(offset: 1) != nil else {
        advance()
        throw LexError(.endOfString, resumeAt: current)
      }
      advance(2)

    default:
      // Try to lex the closing delimiter.
      let contentsEnd = current
      guard tryEat(delimiter.closing.utf8) else {
        advance()
        continue
      }

      // Form a string from the contents and make sure it's valid UTF-8.
      // `String(decoding:as:)` repairs invalid sequences, so a round-trip
      // mismatch means the source bytes were not valid UTF-8.
      let count = contentsEnd - contentsStart
      let contents = UnsafeRawBufferPointer(
        start: contentsStart, count: count)
      let s = String(decoding: contents, as: UTF8.self)

      guard s.utf8.elementsEqual(contents) else {
        throw LexError(.invalidUTF8, resumeAt: current)
      }
      return (contents: s, delimiter, end: current)
    }
  }
}
155-
15612
private func copyCString(_ str: String) -> UnsafePointer<CChar> {
15713
let count = str.utf8.count + 1
15814
return str.withCString {

0 commit comments

Comments
 (0)