Skip to content

Commit 12db649

Browse files
authored
Remove most consumer functions (#660)
This is based heavily on the work in #590, rebased onto main, with some changes to remove even more consumer uses. Consumer functions have only two remaining uses: non-ASCII ranges and Unicode lookups (for things like general category, binary properties, name, etc.). This change primarily treats custom character classes as alternations around their contents, with set operations emitted as instructions instead of being implemented via consumer functions.
1 parent cd0ce57 commit 12db649

File tree

7 files changed

+399
-283
lines changed

7 files changed

+399
-283
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 282 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,11 @@ fileprivate extension Compiler.ByteCodeGen {
256256
}
257257
}
258258

259-
mutating func emitAlternation(
260-
_ children: [DSLTree.Node]
261-
) throws {
259+
mutating func emitAlternationGen<C: BidirectionalCollection>(
260+
_ elements: C,
261+
withBacktracking: Bool,
262+
_ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void
263+
) rethrows {
262264
// Alternation: p0 | p1 | ... | pn
263265
// save next_p1
264266
// <code for p0>
@@ -276,16 +278,27 @@ fileprivate extension Compiler.ByteCodeGen {
276278
// <code for pn>
277279
// done:
278280
let done = builder.makeAddress()
279-
for component in children.dropLast() {
281+
for element in elements.dropLast() {
280282
let next = builder.makeAddress()
281283
builder.buildSave(next)
282-
try emitNode(component)
284+
try body(&self, element)
285+
if !withBacktracking {
286+
builder.buildClear()
287+
}
283288
builder.buildBranch(to: done)
284289
builder.label(next)
285290
}
286-
try emitNode(children.last!)
291+
try body(&self, elements.last!)
287292
builder.label(done)
288293
}
294+
295+
mutating func emitAlternation(
296+
_ children: [DSLTree.Node]
297+
) throws {
298+
try emitAlternationGen(children, withBacktracking: true) {
299+
try $0.emitNode($1)
300+
}
301+
}
289302

290303
mutating func emitConcatenationComponent(
291304
_ node: DSLTree.Node
@@ -872,19 +885,187 @@ fileprivate extension Compiler.ByteCodeGen {
872885
}
873886
}
874887

888+
/// Flattens quoted strings into sequences of atoms, so that the standard
/// CCC codegen will handle them.
///
/// `.char` atoms, `.scalar` atoms, and the individual characters of
/// `.quotedLiteral` members are de-duplicated and moved to the end of the
/// returned list (characters first, then scalars). Every other member keeps
/// its relative order at the front of the list.
///
/// The collected characters and scalars are appended in sorted order. `Set`
/// iteration order is not stable from one process launch to the next, so
/// appending the sets directly would make the order of the emitted members
/// (and therefore the generated program) vary between runs.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The flattened member list; it contains no `.quotedLiteral`
///   members.
func flatteningCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  var characters: Set<Character> = []
  var scalars: Set<UnicodeScalar> = []
  var result: [DSLTree.CustomCharacterClass.Member] = []
  for member in members {
    switch member {
    case .atom(.char(let char)):
      characters.insert(char)
    case .atom(.scalar(let scalar)):
      scalars.insert(scalar)
    case .quotedLiteral(let str):
      // A quoted literal inside a class contributes each of its characters.
      characters.formUnion(str)
    default:
      // Any other atom, and any non-atom member, passes through unchanged.
      result.append(member)
    }
  }
  // Sort before appending so the output does not depend on hash ordering.
  result.append(contentsOf: characters.sorted().map { .atom(.char($0)) })
  result.append(contentsOf: scalars.sorted().map { .atom(.scalar($0)) })
  return result
}
917+
875918
/// Returns a custom character class built from the members of `ccc` after
/// coalescing (grapheme-cluster semantics only) and flattening, with the
/// same `isInverted` flag.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing only needs to be done in grapheme semantic mode. In scalar
  // semantic mode, we don't want to coalesce any scalars into a grapheme.
  // This means that e.g `[e\u{301}-\u{302}]` remains a range between U+301
  // and U+302.
  let coalesced: [DSLTree.CustomCharacterClass.Member]
  if options.semanticLevel == .graphemeCluster {
    coalesced = coalescingCustomCharacterClassMembers(ccc.members)
  } else {
    coalesced = ccc.members
  }

  // Flattening is applied in both semantic modes.
  let flattened = flatteningCustomCharacterClassMembers(coalesced)
  return DSLTree.CustomCharacterClass(
    members: flattened,
    isInverted: ccc.isInverted)
}
887932

933+
/// Emits matching code for a single character that is a member of a custom
/// character class, according to the current semantic level.
///
/// - Parameter c: The character to match.
mutating func emitCharacterInCCC(_ c: Character) {
  switch options.semanticLevel {
  case .unicodeScalar:
    // In scalar mode the character behaves like an alternation of the
    // individual scalars that comprise it; `withBacktracking: false` is
    // passed to the alternation generator.
    emitAlternationGen(
      c.unicodeScalars,
      withBacktracking: false
    ) { codegen, scalar in
      codegen.emitMatchScalar(scalar)
    }
  case .graphemeCluster:
    emitCharacter(c)
  }
}
945+
946+
/// Emits the matching code for one member of a custom character class.
///
/// - Parameter member: The member to emit. `.quotedLiteral` members are
///   expected to have been removed beforehand; encountering one here is a
///   fatal error.
/// - Throws: Any error thrown by `emitAtom`, `emitCustomCharacterClass`, or
///   `generateConsumer`.
mutating func emitCCCMember(
  _ member: DSLTree.CustomCharacterClass.Member
) throws {
  switch member {
  case .atom(.char(let c)):
    emitCharacterInCCC(c)

  case .atom(.scalar(let s)):
    emitCharacterInCCC(Character(s))

  case .atom(let atom):
    try emitAtom(atom)

  case .custom(let ccc):
    try emitCustomCharacterClass(ccc)

  case .quotedLiteral:
    fatalError("Removed in 'flatteningCustomCharacterClassMembers'")

  case .range:
    // Ranges are still matched through a consumer function.
    let rangeConsumer = try member.generateConsumer(options)
    builder.buildConsume(by: rangeConsumer)

  case .trivia:
    return

  // TODO: Can we decide when it's better to try `rhs` first?
  // Intersection: a failure on either side propagates, so the sequence is
  //   1. remember the current position
  //   2. match lhs
  //   3. go back to the remembered position
  //   4. match rhs
  case let .intersection(lhs, rhs):
    let start = builder.makePositionRegister()
    builder.buildMoveCurrentPosition(into: start)
    try emitCustomCharacterClass(lhs)
    builder.buildRestorePosition(from: start)
    try emitCustomCharacterClass(rhs)

  // TODO: Can we decide when it's better to try `rhs` first?
  // Subtraction: a failure in `lhs` propagates, while a failure in `rhs` is
  // swallowed/reversed:
  //   1. remember the current position
  //   2. match lhs
  //   3. save to `done`
  //   4. go back to the remembered position
  //   5. match rhs
  //   6. clear, fail (both sides succeeded)
  //   7. done: ...
  case let .subtraction(lhs, rhs):
    let start = builder.makePositionRegister()
    let done = builder.makeAddress()
    builder.buildMoveCurrentPosition(into: start)
    // No match in lhs is a failure that propagates.
    try emitCustomCharacterClass(lhs)
    builder.buildSave(done)
    builder.buildRestorePosition(from: start)
    // No match in rhs is a success, which resumes at `done`.
    try emitCustomCharacterClass(rhs)
    // rhs matched as well: drop the `done` save point and fail outward.
    builder.buildClear()
    builder.buildFail()
    builder.label(done)

  // Symmetric difference always has to run both `lhs` and `rhs`. Each side
  // is executed with its failure ignored, and the resulting position is
  // stored in a register. Equal results mean failure; otherwise the
  // position that differs from the starting one is used:
  //   - store current position as `origin`
  //   - save to lhsFail
  //   - lhs
  //   - clear lhsFail (and continue)
  //   - lhsFail: store position as `afterLHS`
  //
  //   - restore `origin`
  //   - save to rhsFail
  //   - rhs
  //   - clear rhsFail (and continue)
  //   - rhsFail: store position as `afterRHS`
  //
  //   - restore `afterLHS`
  //   - if equal to `afterRHS`, goto fail (both sides had the same result)
  //   - if equal to `origin`, goto advance (lhs failed)
  //   - goto end
  //   - advance: restore `afterRHS`
  //   - goto end
  //   - fail: fail
  //   - end: ...
  case let .symmetricDifference(lhs, rhs):
    let origin = builder.makePositionRegister()
    let afterLHS = builder.makePositionRegister()
    let afterRHS = builder.makePositionRegister()
    let lhsFail = builder.makeAddress()
    let rhsFail = builder.makeAddress()
    let advance = builder.makeAddress()
    let fail = builder.makeAddress()
    let end = builder.makeAddress()

    // Run lhs and record where it ended up.
    builder.buildMoveCurrentPosition(into: origin)
    builder.buildSave(lhsFail)
    try emitCustomCharacterClass(lhs)
    builder.buildClear()
    builder.label(lhsFail)
    builder.buildMoveCurrentPosition(into: afterLHS)

    // Run rhs from the same starting point and record where it ended up.
    builder.buildRestorePosition(from: origin)
    builder.buildSave(rhsFail)
    try emitCustomCharacterClass(rhs)
    builder.buildClear()
    builder.label(rhsFail)
    builder.buildMoveCurrentPosition(into: afterRHS)

    // Same result on both sides: fail.
    builder.buildRestorePosition(from: afterLHS)
    builder.buildCondBranch(to: fail, ifSamePositionAs: afterRHS)

    // lhs left the position at the origin: use the rhs result instead.
    builder.buildCondBranch(to: advance, ifSamePositionAs: origin)
    builder.buildBranch(to: end)
    builder.label(advance)
    builder.buildRestorePosition(from: afterRHS)
    builder.buildBranch(to: end)

    builder.label(fail)
    builder.buildFail()
    builder.label(end)
  }
}
1068+
8881069
mutating func emitCustomCharacterClass(
8891070
_ ccc: DSLTree.CustomCharacterClass
8901071
) throws {
@@ -902,8 +1083,67 @@ fileprivate extension Compiler.ByteCodeGen {
9021083
}
9031084
return
9041085
}
905-
let consumer = try ccc.generateConsumer(options)
906-
builder.buildConsume(by: consumer)
1086+
1087+
let updatedCCC: DSLTree.CustomCharacterClass
1088+
if optimizationsEnabled {
1089+
updatedCCC = ccc.coalescingASCIIMembers(options)
1090+
} else {
1091+
updatedCCC = ccc
1092+
}
1093+
let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia})
1094+
1095+
if updatedCCC.isInverted {
1096+
// inverted
1097+
// custom character class: p0 | p1 | ... | pn
1098+
// Try each member to make sure they all fail
1099+
// save next_p1
1100+
// <code for p0>
1101+
// clear, fail
1102+
// next_p1:
1103+
// save next_p2
1104+
// <code for p1>
1105+
// clear fail
1106+
// next_p2:
1107+
// save next_p...
1108+
// <code for p2>
1109+
// clear fail
1110+
// ...
1111+
// next_pn:
1112+
// save done
1113+
// <code for pn>
1114+
// clear fail
1115+
// done:
1116+
// step forward by 1
1117+
let done = builder.makeAddress()
1118+
for member in filteredMembers.dropLast() {
1119+
let next = builder.makeAddress()
1120+
builder.buildSave(next)
1121+
try emitCCCMember(member)
1122+
builder.buildClear()
1123+
builder.buildFail()
1124+
builder.label(next)
1125+
}
1126+
builder.buildSave(done)
1127+
try emitCCCMember(filteredMembers.last!)
1128+
builder.buildClear()
1129+
builder.buildFail()
1130+
builder.label(done)
1131+
1132+
// Consume a single unit for the inverted ccc
1133+
switch options.semanticLevel {
1134+
case .graphemeCluster:
1135+
builder.buildAdvance(1)
1136+
case .unicodeScalar:
1137+
builder.buildAdvanceUnicodeScalar(1)
1138+
}
1139+
return
1140+
}
1141+
// non inverted CCC
1142+
// Custom character class: p0 | p1 | ... | pn
1143+
// Very similar to alternation, but we don't keep backtracking save points
1144+
try emitAlternationGen(filteredMembers, withBacktracking: false) {
1145+
try $0.emitCCCMember($1)
1146+
}
9071147
}
9081148

9091149
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
@@ -1040,6 +1280,12 @@ fileprivate extension Compiler.ByteCodeGen {
10401280
}
10411281

10421282
extension DSLTree.Node {
1283+
/// A Boolean value indicating whether this node advances the match position
1284+
/// on a successful match.
1285+
///
1286+
/// For example, an alternation like `(a|b|c)` always advances the position
1287+
/// by a character, but `(a|b|)` has an empty branch, which matches without
1288+
/// advancing.
10431289
var guaranteesForwardProgress: Bool {
10441290
switch self {
10451291
case .orderedChoice(let children):
@@ -1070,12 +1316,34 @@ extension DSLTree.Node {
10701316
case .consumer, .matcher:
10711317
// Allow zero width consumers and matchers
10721318
return false
1073-
case .customCharacterClass:
1074-
return true
1319+
case .customCharacterClass(let ccc):
1320+
return ccc.guaranteesForwardProgress
10751321
case .quantification(let amount, _, let child):
10761322
let (atLeast, _) = amount.ast.bounds
10771323
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
10781324
default: return false
10791325
}
10801326
}
10811327
}
1328+
1329+
extension DSLTree.CustomCharacterClass {
  /// A Boolean value derived from the first non-trivia member of this class.
  ///
  /// We allow trivia into CustomCharacterClass, which could result in a CCC
  /// that matches nothing, ie `(?x)[ ]`. A class with no non-trivia members
  /// reports `false`.
  ///
  /// Only the first non-trivia member is inspected: set operations defer to
  /// their operands (subtraction only to its left-hand side), and any other
  /// member reports `true`.
  var guaranteesForwardProgress: Bool {
    let firstNonTrivia = members.first(where: { member in
      if case .trivia = member { return false }
      return true
    })
    guard let deciding = firstNonTrivia else {
      // Empty, or nothing but trivia.
      return false
    }

    switch deciding {
    case let .intersection(lhs, rhs),
         let .symmetricDifference(lhs, rhs):
      return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress
    case let .subtraction(lhs, _):
      return lhs.guaranteesForwardProgress
    default:
      return true
    }
  }
}

0 commit comments

Comments
 (0)