Skip to content

Commit 405fbcb

Browse files
authored
Implement instructions for matching builtin character classes and assertions (#547)
- Adds `matchBuiltin` and adjusts `assertBy` to use a switch in the processor instead of taking a generic assertion function.
- Adds a `CharacterClass` atom.
1 parent 415b080 commit 405fbcb

20 files changed

+785
-604
lines changed

Sources/RegexBuilder/CharacterClass.swift

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,39 @@
1515
@available(SwiftStdlib 5.7, *)
1616
public struct CharacterClass {
1717
internal var ccc: DSLTree.CustomCharacterClass
18+
/// The builtin character class, if this `CharacterClass` is representable by one.
19+
internal var builtin: DSLTree.Atom.CharacterClass?
1820

1921
init(_ ccc: DSLTree.CustomCharacterClass) {
2022
self.ccc = ccc
23+
self.builtin = nil
2124
}
2225

23-
init(unconverted atom: DSLTree._AST.Atom) {
24-
self.ccc = .init(members: [.atom(.unconverted(atom))])
26+
init(builtin: DSLTree.Atom.CharacterClass) {
27+
self.ccc = .init(members: [.atom(.characterClass(builtin))])
28+
self.builtin = builtin
2529
}
2630
}
2731

2832
@available(SwiftStdlib 5.7, *)
2933
extension CharacterClass: RegexComponent {
3034
public var regex: Regex<Substring> {
31-
_RegexFactory().customCharacterClass(ccc)
35+
if let cc = builtin {
36+
return _RegexFactory().characterClass(cc)
37+
} else {
38+
return _RegexFactory().customCharacterClass(ccc)
39+
}
3240
}
3341
}
3442

3543
@available(SwiftStdlib 5.7, *)
3644
extension CharacterClass {
3745
public var inverted: CharacterClass {
38-
CharacterClass(ccc.inverted)
46+
if let inv = builtin?.inverted {
47+
return CharacterClass(builtin: inv)
48+
} else {
49+
return CharacterClass(ccc.inverted)
50+
}
3951
}
4052
}
4153

@@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass {
5062
}
5163

5264
public static var anyGraphemeCluster: CharacterClass {
53-
.init(unconverted: ._anyGrapheme)
65+
.init(builtin: .anyGrapheme)
5466
}
5567

5668
public static var whitespace: CharacterClass {
57-
.init(unconverted: ._whitespace)
69+
.init(builtin: .whitespace)
5870
}
5971

6072
public static var digit: CharacterClass {
61-
.init(unconverted: ._digit)
73+
.init(builtin: .digit)
6274
}
6375

6476
public static var hexDigit: CharacterClass {
@@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass {
7082
}
7183

7284
public static var horizontalWhitespace: CharacterClass {
73-
.init(unconverted: ._horizontalWhitespace)
85+
.init(builtin: .horizontalWhitespace)
7486
}
7587

7688
public static var newlineSequence: CharacterClass {
77-
.init(unconverted: ._newlineSequence)
89+
.init(builtin: .newlineSequence)
7890
}
7991

8092
public static var verticalWhitespace: CharacterClass {
81-
.init(unconverted: ._verticalWhitespace)
93+
.init(builtin: .verticalWhitespace)
8294
}
8395

8496
public static var word: CharacterClass {
85-
.init(unconverted: ._word)
97+
.init(builtin: .word)
8698
}
8799
}
88100

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 18 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen {
7474
emitMatchScalar(s)
7575
}
7676

77+
case let .characterClass(cc):
78+
emitCharacterClass(cc)
79+
7780
case let .assertion(kind):
7881
try emitAssertion(kind)
7982

@@ -148,147 +151,24 @@ fileprivate extension Compiler.ByteCodeGen {
148151
}
149152
}
150153

151-
mutating func emitStartOfLine() {
152-
builder.buildAssert { [semanticLevel = options.semanticLevel]
153-
(_, _, input, pos, subjectBounds) in
154-
if pos == subjectBounds.lowerBound { return true }
155-
switch semanticLevel {
156-
case .graphemeCluster:
157-
return input[input.index(before: pos)].isNewline
158-
case .unicodeScalar:
159-
return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
160-
}
161-
}
162-
}
163-
164-
mutating func emitEndOfLine() {
165-
builder.buildAssert { [semanticLevel = options.semanticLevel]
166-
(_, _, input, pos, subjectBounds) in
167-
if pos == subjectBounds.upperBound { return true }
168-
switch semanticLevel {
169-
case .graphemeCluster:
170-
return input[pos].isNewline
171-
case .unicodeScalar:
172-
return input.unicodeScalars[pos].isNewline
173-
}
174-
}
175-
}
176-
177154
mutating func emitAssertion(
178155
_ kind: DSLTree.Atom.Assertion
179156
) throws {
180-
// FIXME: Depends on API model we have... We may want to
181-
// think through some of these with API interactions in mind
182-
//
183-
// This might break how we use `bounds` for both slicing
184-
// and things like `firstIndex`, that is `firstIndex` may
185-
// need to supply both a slice bounds and a per-search bounds.
186-
switch kind {
187-
case .startOfSubject:
188-
builder.buildAssert { (_, _, input, pos, subjectBounds) in
189-
pos == subjectBounds.lowerBound
190-
}
191-
192-
case .endOfSubjectBeforeNewline:
193-
builder.buildAssert { [semanticLevel = options.semanticLevel]
194-
(_, _, input, pos, subjectBounds) in
195-
if pos == subjectBounds.upperBound { return true }
196-
switch semanticLevel {
197-
case .graphemeCluster:
198-
return input.index(after: pos) == subjectBounds.upperBound
199-
&& input[pos].isNewline
200-
case .unicodeScalar:
201-
return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
202-
&& input.unicodeScalars[pos].isNewline
203-
}
204-
}
205-
206-
case .endOfSubject:
207-
builder.buildAssert { (_, _, input, pos, subjectBounds) in
208-
pos == subjectBounds.upperBound
209-
}
210-
211-
case .resetStartOfMatch:
212-
// FIXME: Figure out how to communicate this out
157+
if kind == .resetStartOfMatch {
213158
throw Unsupported(#"\K (reset/keep assertion)"#)
214-
215-
case .firstMatchingPositionInSubject:
216-
// TODO: We can probably build a nice model with API here
217-
218-
// FIXME: This needs to be based on `searchBounds`,
219-
// not the `subjectBounds` given as an argument here
220-
builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
221-
222-
case .textSegment:
223-
builder.buildAssert { (_, _, input, pos, _) in
224-
// FIXME: Grapheme or word based on options
225-
input.isOnGraphemeClusterBoundary(pos)
226-
}
227-
228-
case .notTextSegment:
229-
builder.buildAssert { (_, _, input, pos, _) in
230-
// FIXME: Grapheme or word based on options
231-
!input.isOnGraphemeClusterBoundary(pos)
232-
}
233-
234-
case .startOfLine:
235-
emitStartOfLine()
236-
237-
case .endOfLine:
238-
emitEndOfLine()
239-
240-
case .caretAnchor:
241-
if options.anchorsMatchNewlines {
242-
emitStartOfLine()
243-
} else {
244-
builder.buildAssert { (_, _, input, pos, subjectBounds) in
245-
pos == subjectBounds.lowerBound
246-
}
247-
}
248-
249-
case .dollarAnchor:
250-
if options.anchorsMatchNewlines {
251-
emitEndOfLine()
252-
} else {
253-
builder.buildAssert { (_, _, input, pos, subjectBounds) in
254-
pos == subjectBounds.upperBound
255-
}
256-
}
257-
258-
case .wordBoundary:
259-
builder.buildAssert { [options]
260-
(cache, maxIndex, input, pos, subjectBounds) in
261-
if options.usesSimpleUnicodeBoundaries {
262-
// TODO: How should we handle bounds?
263-
return _CharacterClassModel.word.isBoundary(
264-
input,
265-
at: pos,
266-
bounds: subjectBounds,
267-
with: options
268-
)
269-
} else {
270-
return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
271-
}
272-
}
273-
274-
case .notWordBoundary:
275-
builder.buildAssert { [options]
276-
(cache, maxIndex, input, pos, subjectBounds) in
277-
if options.usesSimpleUnicodeBoundaries {
278-
// TODO: How should we handle bounds?
279-
return !_CharacterClassModel.word.isBoundary(
280-
input,
281-
at: pos,
282-
bounds: subjectBounds,
283-
with: options
284-
)
285-
} else {
286-
return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
287-
}
288-
}
289159
}
160+
builder.buildAssert(
161+
by: kind,
162+
options.anchorsMatchNewlines,
163+
options.usesSimpleUnicodeBoundaries,
164+
options.usesASCIIWord,
165+
options.semanticLevel)
290166
}
291-
167+
168+
mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) {
169+
builder.buildMatchBuiltin(model: cc.asRuntimeModel(options))
170+
}
171+
292172
mutating func emitMatchScalar(_ s: UnicodeScalar) {
293173
assert(options.semanticLevel == .unicodeScalar)
294174
if options.isCaseInsensitive && s.properties.isCased {
@@ -907,10 +787,10 @@ fileprivate extension Compiler.ByteCodeGen {
907787
} else {
908788
builder.buildMatchAsciiBitset(asciiBitset)
909789
}
910-
} else {
911-
let consumer = try ccc.generateConsumer(options)
912-
builder.buildConsume(by: consumer)
790+
return
913791
}
792+
let consumer = try ccc.generateConsumer(options)
793+
builder.buildConsume(by: consumer)
914794
}
915795

916796
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ extension DSLTree.Atom {
162162
case .assertion:
163163
// TODO: We could handle, should this be total?
164164
return nil
165+
case .characterClass(let cc):
166+
return cc.generateConsumer(opts)
165167

166168
case .backreference:
167169
// TODO: Should we handle?
@@ -182,6 +184,15 @@ extension DSLTree.Atom {
182184
}
183185
}
184186

187+
extension DSLTree.Atom.CharacterClass {
188+
func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
189+
let model = asRuntimeModel(opts)
190+
return { input, bounds in
191+
model.matches(in: input, at: bounds.lowerBound)
192+
}
193+
}
194+
}
195+
185196
extension String {
186197
/// Compares this string to `other` using the loose matching rule UAX44-LM2,
187198
/// which ignores case, whitespace, underscores, and nearly all medial
@@ -269,16 +280,6 @@ extension AST.Atom {
269280
func generateConsumer(
270281
_ opts: MatchingOptions
271282
) throws -> MEProgram.ConsumeFunction? {
272-
// TODO: Wean ourselves off of this type...
273-
if let cc = self.characterClass?.withMatchLevel(
274-
opts.matchLevel
275-
) {
276-
return { input, bounds in
277-
// FIXME: should we worry about out of bounds?
278-
cc.matches(in: input, at: bounds.lowerBound, with: opts)
279-
}
280-
}
281-
282283
switch kind {
283284
case let .scalar(s):
284285
assertionFailure(
@@ -312,8 +313,11 @@ extension AST.Atom {
312313
case .caretAnchor, .dollarAnchor:
313314
// handled in emitAssertion
314315
return nil
316+
case .escaped:
317+
// handled in emitAssertion and emitCharacterClass
318+
return nil
315319

316-
case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta,
320+
case .scalarSequence, .keyboardControl, .keyboardMeta,
317321
.keyboardMetaControl, .backreference, .subpattern, .callout,
318322
.backtrackingDirective, .changeMatchingOptions, .invalid:
319323
// FIXME: implement

0 commit comments

Comments
 (0)